diff --git a/README.md b/README.md
index 280c8724..b8e2785b 100644
--- a/README.md
+++ b/README.md
@@ -201,10 +201,6 @@ Output in Node Console:
## Documentation
The official documentation can be found [here](https://danfo.jsdata.org)
-## Danfo.js Official Book
-
-We published a book titled "Building Data Driven Applications with Danfo.js". Read more about it [here](https://danfo.jsdata.org/building-data-driven-applications-with-danfo.js-book)
-
## Discussion and Development
Development discussions take place [here](https://github.com/opensource9ja/danfojs/discussions).
@@ -212,7 +208,3 @@ Development discussions take place [here](https://github.com/opensource9ja/danfo
All contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome. A detailed overview on how to contribute can be found in the [contributing guide](https://danfo.jsdata.org/contributing-guide).
#### Licence [MIT](https://github.com/opensource9ja/danfojs/blob/master/LICENCE)
-
-#### Created by [Rising Odegua](https://github.com/risenW) and [Stephen Oni](https://github.com/steveoni)
-
-
diff --git a/performance-test.js b/performance-test.js
new file mode 100644
index 00000000..6bdea7a6
--- /dev/null
+++ b/performance-test.js
@@ -0,0 +1,100 @@
+const { DataFrame } = require('./src/danfojs-node/dist/danfojs-node/src');
+
+/**
+ * Builds a DataFrame of `rows` random rows whose `group_col` label cycles through `numGroups` values.
+ * @param rows number of rows to generate
+ * @param numGroups number of distinct group labels (defaults to 100)
+ * @returns DataFrame with columns group_col, value_a, value_b, value_c
+ */
+function generateTestData(rows, numGroups = 100) {
+  console.log(`Generating ${rows} rows of test data with ~${numGroups} groups...`);
+  const columns = ['group_col', 'value_a', 'value_b', 'value_c'];
+  const data = [];
+  for (let rowIdx = 0; rowIdx < rows; rowIdx += 1) {
+    const label = `group_${rowIdx % numGroups}`; // label repeats every numGroups rows
+    // Remaining cells are value_a in [0, 1000), value_b in [0, 500), value_c in [0, 100).
+    data.push([label, Math.random() * 1000, Math.random() * 500, Math.random() * 100]);
+  }
+  return new DataFrame(data, { columns });
+}
+
+/**
+ * Times four groupby stages on `df`, logs each one, and returns every stage's own duration in ms.
+ * @param df DataFrame that has a `group_col` column plus `value_a`, `value_b`, `value_c`
+ * @param testName label printed in the section banner
+ */
+function performanceTest(df, testName) {
+  console.log(`\n=== ${testName} ===`);
+  console.log(`DataFrame shape: ${df.shape[0]} rows, ${df.shape[1]} columns`);
+
+  // Test 1: Basic groupby construction
+  console.log('\nTest 1: Group construction...');
+  let start = performance.now();
+  const grouped = df.groupby(['group_col']);
+  const construction = performance.now() - start;
+  console.log(`Group construction: ${construction.toFixed(2)}ms`);
+  console.log(`Number of groups: ${grouped.ngroups}`);
+
+  // Test 2: Single column aggregation
+  console.log('\nTest 2: Single column sum...');
+  start = performance.now();
+  const sumResult = grouped.col(['value_a']).sum();
+  const singleSum = performance.now() - start;
+  console.log(`Single column sum: ${singleSum.toFixed(2)}ms`);
+  console.log(`Result shape: ${sumResult.shape[0]} rows`);
+
+  // Test 3: Multiple column aggregation
+  console.log('\nTest 3: Multiple column aggregations...');
+  start = performance.now();
+  const multiResult = grouped.agg({
+    value_a: 'mean',
+    value_b: 'sum',
+    value_c: 'count'
+  });
+  const multiAgg = performance.now() - start;
+  console.log(`Multiple aggregations: ${multiAgg.toFixed(2)}ms`);
+  console.log(`Result shape: ${multiResult.shape[0]} rows`);
+
+  // Test 4: Complex aggregation (multiple operations per column)
+  console.log('\nTest 4: Complex aggregation...');
+  start = performance.now();
+  const complexResult = grouped.agg({
+    value_a: ['mean', 'max', 'min'],
+    value_b: ['sum', 'count'],
+    value_c: 'std'
+  });
+  const complexAgg = performance.now() - start;
+  console.log(`Complex aggregation: ${complexAgg.toFixed(2)}ms`);
+  console.log(`Result shape: ${complexResult.shape[0]} rows`);
+
+  return { construction, singleSum, multiAgg, complexAgg }; // one independent duration per stage
+}
+
+/**
+ * Entry point: benchmarks groupby on three dataset sizes and prints a closing summary.
+ * Declared async so the caller can attach `.catch` to the returned promise.
+ */
+async function main() {
+  console.log('DanfoJS GroupBy Performance Test');
+  console.log('================================');
+
+  // Scenarios run in order, smallest first: [row count, group count, banner label].
+  const scenarios = [
+    [1000, 50, 'Small Dataset (1K rows)'],
+    [5000, 100, 'Medium Dataset (5K rows)'],
+    [20000, 200, 'Large Dataset (20K rows)'],
+  ];
+
+  scenarios.forEach(([rowCount, groupCount, label]) => {
+    performanceTest(generateTestData(rowCount, groupCount), label);
+    // global.gc is only defined in some runs (e.g. node --expose-gc); skip it otherwise.
+    if (global.gc) global.gc();
+  });
+
+  console.log('\n=== Performance Test Complete ===');
+  console.log('Check the times above - we should see significant improvement!');
+  console.log('Target: 20K rows should complete in < 2 seconds total');
+}
+
+// Run the test
+main().catch(console.error);
\ No newline at end of file
diff --git a/src/danfojs-base/aggregators/groupby.ts b/src/danfojs-base/aggregators/groupby.ts
index 63671914..dacdebd7 100644
--- a/src/danfojs-base/aggregators/groupby.ts
+++ b/src/danfojs-base/aggregators/groupby.ts
@@ -12,14 +12,12 @@
* limitations under the License.
* ==========================================================================
*/
-import DataFrame from "../core/frame"
-import { ArrayType1D, ArrayType2D } from "../shared/types"
-import { variance, std, median, mode } from 'mathjs';
-import concat from "../transformers/concat"
+import DataFrame from "../core/frame";
+import { ArrayType1D, ArrayType2D } from "../shared/types";
+import { variance, std, median, mode } from "mathjs";
+import concat from "../transformers/concat";
import Series from "../core/series";
-
-
/**
* The class performs all groupby operation on a dataframe
* involving all aggregate funciton
@@ -30,28 +28,103 @@ import Series from "../core/series";
* @param {colDtype} Array columns dtype
*/
export default class Groupby {
- colDict: { [key: string ]: {} } = {}
- keyCol: ArrayType1D
- data?: ArrayType2D | null
- columnName: ArrayType1D
- colDtype: ArrayType1D
- colIndex: ArrayType1D
- groupDict?: any
- groupColNames?: Array
- keyToValue: {
- [key: string] : ArrayType1D
- } = {}
-
- constructor(keyCol: ArrayType1D, data: ArrayType2D | null, columnName: ArrayType1D, colDtype:ArrayType1D, colIndex: ArrayType1D) {
+ private _colDict: Map = new Map();
+ keyCol: ArrayType1D;
+ data?: ArrayType2D | null;
+ columnName: ArrayType1D;
+ colDtype: ArrayType1D;
+ colIndex: ArrayType1D;
+ groupDict?: any;
+ groupColNames?: Array;
+ keyToValue: Map = new Map();
+ // Cache for optimized key generation
+ private keyGeneratorCache: Map string> =
+ new Map();
+ constructor(
+ keyCol: ArrayType1D,
+ data: ArrayType2D | null,
+ columnName: ArrayType1D,
+ colDtype: ArrayType1D,
+ colIndex: ArrayType1D
+ ) {
this.keyCol = keyCol;
this.data = data;
this.columnName = columnName;
//this.dataTensors = {}; //store the tensor version of the groupby data
this.colDtype = colDtype;
- this.colIndex = colIndex
+ this.colIndex = colIndex;
+ }
+
+ /**
+ * Generate optimized key generation function based on column types
+ */
+ private getKeyGenerator(): (values: ArrayType1D) => string {
+ const cacheKey = this.colIndex.join("|");
+
+ if (this.keyGeneratorCache.has(cacheKey)) {
+ return this.keyGeneratorCache.get(cacheKey)!;
+ }
+
+ // Analyze column types to determine best key generation strategy
+ let allNumeric = true;
+ let allInteger = true;
+
+ for (let i = 0; i < this.colIndex.length; i++) {
+ const colIdx = this.colIndex[i] as number;
+ const dtype = this.colDtype[colIdx];
+ if (dtype === "string") {
+ allNumeric = false;
+ allInteger = false;
+ break;
+ }
+ // Check if it's integer-like
+ if (dtype === "float32" || dtype === "float64") {
+ allInteger = false;
+ }
+ }
+ let keyGenerator: (values: ArrayType1D) => string;
+
+ if (allInteger && this.colIndex.length === 1) {
+ // Single integer column - fastest path
+ keyGenerator = (values: ArrayType1D) => String(values[0]);
+ } else if (allNumeric && this.colIndex.length === 1) {
+ // Single numeric column
+ keyGenerator = (values: ArrayType1D) => String(values[0]);
+ } else if (allInteger) {
+ // Multiple integer columns - use custom concatenation
+ keyGenerator = (values: ArrayType1D) => {
+ let result = String(values[0]);
+ for (let i = 1; i < values.length; i++) {
+ result += "-" + String(values[i]);
+ }
+ return result;
+ };
+ } else if (allNumeric) {
+ // Multiple numeric columns
+ keyGenerator = (values: ArrayType1D) => {
+ let result = String(values[0]);
+ for (let i = 1; i < values.length; i++) {
+ result += "-" + String(values[i]);
+ }
+ return result;
+ };
+ } else {
+ // Mixed types - fall back to join (but with pre-converted strings)
+ keyGenerator = (values: ArrayType1D) => {
+ const stringValues = new Array(values.length);
+ for (let i = 0; i < values.length; i++) {
+ stringValues[i] = String(values[i]);
+ }
+ return stringValues.join("-");
+ };
+ }
+
+ this.keyGeneratorCache.set(cacheKey, keyGenerator);
+ return keyGenerator;
}
+
/**
* Generate group object data needed for group operations
* let data = [ [ 1, 2, 3 ], [ 4, 5, 6 ], [ 20, 30, 40 ], [ 39, 89, 78 ] ];
@@ -84,58 +157,68 @@ export default class Groupby {
* This could actually be generated by using split('-') on the object keys
* e.g '1-2'.split('-') will give us the value for A and B.
* But we might have weird case scenerio where A and B value has '-`
- * e.g
+ * e.g
* {
* '1--2-': { C: [ 3 ]},
* '4--5-': {C: [ 6 ]}
* }
* using `.split('-') might not work well
- * Hence we create a key-value `keyToValue` object to store index and their
+ * Hence we create a key-value `keyToValue` object to store index and their
* associated value
* NOTE: In the previous implementation we made use of Graph representation
* for the group by data and Depth First search (DFS). But we decided to use key-value
* object in javascript as an hashmap to reduce search time compared to using Grpah and DFS
*/
- group(): Groupby{
- const self = this
- let keyToValue:{
- [key: string] : ArrayType1D
- } = {}
- const group = this.data?.reduce((prev: any, current)=>{
- let indexes= []
- for(let i in self.colIndex) {
- let index = self.colIndex[i] as number
- indexes.push(current[index])
- }
- let index = indexes.join('-')
-
- if(!keyToValue[index]) {
- keyToValue[index] = indexes
- }
-
- if(prev[index]) {
- let data = prev[index]
- for (let i in self.columnName) {
- let colName = self.columnName[i] as string
- data[colName].push(current[i])
+ group(): Groupby {
+ const self = this;
+
+ // Guard clause: if data is null or undefined, return early
+ if (!this.data) {
+ return this;
+ }
+
+ // Pre-compute column indices for faster access
+ const colIndices = this.colIndex as number[];
+ const columnNames = this.columnName as string[];
+ const keyGenerator = this.getKeyGenerator();
+
+ this.data.forEach((current) => {
+ // Extract group key values more efficiently
+ const keyValues: ArrayType1D = [];
+ for (let i = 0; i < colIndices.length; i++) {
+ keyValues.push(current[colIndices[i]]);
+ }
+
+ // Use optimized key generation
+ const keyString = keyGenerator(keyValues);
+
+ // Cache key-to-value mapping only once
+ if (!this.keyToValue.has(keyString)) {
+ this.keyToValue.set(keyString, keyValues);
+ }
+
+ // Get or create group data
+ let groupData = this._colDict.get(keyString);
+ if (groupData) {
+ // Add to existing group - direct array access
+ for (let i = 0; i < columnNames.length; i++) {
+ groupData[columnNames[i]].push(current[i]);
}
} else {
- prev[index] = {}
- for (let i in self.columnName) {
- let colName = self.columnName[i] as string
- prev[index][colName] = [current[i]]
+ // Create new group
+ groupData = {};
+ for (let i = 0; i < columnNames.length; i++) {
+ groupData[columnNames[i]] = [current[i]];
}
+ this._colDict.set(keyString, groupData);
}
- return prev
+ });
- }, {})
- this.colDict = group
- this.keyToValue = keyToValue
- return this
+ return this;
}
/**
- * Generate new internal groupby data
+ * Generate new internal groupby data
* group = df.groupby(['A', 'B']).col('C')
* This filter the colDict property as generated by `.group()`
* it filter each group to contain only column `C` in their internal object
@@ -148,55 +231,58 @@ export default class Groupby {
* {
* '1-2': { C: [ 3 ]},
* '4-5': {C: [ 6 ]}
- * }
+ * }
* @param colNames column names
* @return Groupby
*/
col(colNames: ArrayType1D | undefined): Groupby {
-
if (typeof colNames === "undefined") {
- colNames = this.columnName.filter((_, index)=>{
- return !this.colIndex.includes(index)
- })
+ colNames = this.columnName.filter((_, index) => {
+ return !this.colIndex.includes(index);
+ });
}
- let self = this
- colNames.forEach((val) => {
- if (!self.columnName.includes(val))
- throw new Error(`Column ${val} does not exist in groups`)
- })
- let colDict: { [key: string ]: {} } = {...this.colDict}
- for(let [key, values] of Object.entries(colDict)) {
- let c: { [key: string ]: [] } = {}
- let keyVal: any = {...values}
- for(let colKey in colNames) {
- let colName = colNames[colKey] as string
- c[colName] = keyVal[colName]
- }
- colDict[key] = c
+
+ // Validate column names
+ const colNamesArray = colNames as string[];
+ for (const colName of colNamesArray) {
+ if (!this.columnName.includes(colName))
+ throw new Error(`Column ${colName} does not exist in groups`);
+ }
+
+ // Create new Map with filtered columns (avoid deep copying)
+ const newColDict = new Map();
+
+ for (const [key, values] of Array.from(this._colDict.entries())) {
+ const filteredData: { [key: string]: ArrayType1D } = {};
+ for (const colName of colNamesArray) {
+ filteredData[colName] = values[colName];
+ }
+ newColDict.set(key, filteredData);
}
+
const gp = new Groupby(
this.keyCol,
null,
this.columnName,
this.colDtype,
this.colIndex
- )
- gp.colDict = colDict
- gp.groupColNames = colNames as Array
- gp.keyToValue = this.keyToValue
+ );
+ gp._colDict = newColDict;
+ gp.groupColNames = colNamesArray;
+ gp.keyToValue = this.keyToValue;
- return gp
+ return gp;
}
/**
* Perform all groupby arithmetic operations
- * In the previous implementation all groups data are
- * stord as DataFrame, which involve lot of memory usage
+ * In the previous implementation all groups data are
+ * stored as DataFrame, which involves a lot of memory usage
* Hence each groups are just pure javascrit object
- * and all arithmetic operation is done directly on javascript
+ * and all arithmetic operation is done directly on javascript
* arrays.
- * e.g
- * using this internal data
+ * e.g
+ * using this internal data
* {
* '1-2': {A: [ 1,3 ], B: [ 2,5 ], C: [ 3, 5 ]},
* '4-5': {A: [ 4,1 ], B: [ 5,0 ], C: [ 6, 12 ]}
@@ -211,7 +297,7 @@ export default class Groupby {
* B: 'sum',
* C: 'min'
* })
- * result:
+ * result:
* {
* '1-2': {A_mean: [ 2 ], B_sum: [ 7 ], C_min: [ 3 ]},
* '4-5': {A_mean: [ 2.5 ], B_sum: [ 5 ], C_min: [ 6 ]}
@@ -226,294 +312,559 @@ export default class Groupby {
* '1-2': {A_mean: [ 2 ], B_sum: [ 7 ], C_min: [ 3 ], C_max: [5]},
* '4-5': {A_mean: [ 2.5 ], B_sum: [ 5 ], C_min: [ 6 ], C_max: [12]}
* }
- * @param operation
+ * @param operation
*/
- private arithemetic(operation: {[key: string] : Array | string} | string): { [key: string ]: {} } {
+ private arithemetic(
+ operation: { [key: string]: Array | string } | string
+ ): Map }> {
+ const opsName = [
+ "mean",
+ "sum",
+ "count",
+ "mode",
+ "std",
+ "var",
+ "cumsum",
+ "cumprod",
+ "cummax",
+ "cummin",
+ "median",
+ "min",
+ "max",
+ ];
- const opsName = [ "mean", "sum", "count", "mode", "std", "var", "cumsum", "cumprod",
- "cummax", "cummin", "median" , "min", "max"];
- if (typeof operation === "string" ) {
+ // Validate operations
+ if (typeof operation === "string") {
if (!opsName.includes(operation)) {
- throw new Error(`group operation: ${operation} is not valid`)
+ throw new Error(`group operation: ${operation} is not valid`);
}
} else {
- Object.keys(operation).forEach((key)=>{
- let ops = operation[key]
- if(Array.isArray(ops)) {
- for(let op of ops) {
+ Object.keys(operation).forEach((key) => {
+ let ops = operation[key];
+ if (Array.isArray(ops)) {
+ for (let op of ops) {
if (!opsName.includes(op)) {
- throw new Error(`group operation: ${op} for column ${key} is not valid`)
+ throw new Error(
+ `group operation: ${op} for column ${key} is not valid`
+ );
}
}
} else {
if (!opsName.includes(ops)) {
- throw new Error(`group operation: ${ops} for column ${key} is not valid`)
+ throw new Error(
+ `group operation: ${ops} for column ${key} is not valid`
+ );
}
}
-
- })
+ });
}
- let colDict: { [key: string ]: {} } = {...this.colDict}
- for(const [key, values] of Object.entries(colDict)) {
- let colVal: { [key: string ]: Array } = {}
- let keyVal: any = {...values}
- let groupColNames: Array = this.groupColNames as Array
- for(let colKey=0; colKey < groupColNames.length; colKey++) {
- let colName = groupColNames[colKey]
- let colIndex = this.columnName.indexOf(colName)
- let colDtype = this.colDtype[colIndex]
- let operationVal = (typeof operation === "string") ? operation : operation[colName]
- if (colDtype === "string" && operationVal !== "count") throw new Error(`Can't perform math operation on column ${colName}`)
- if (typeof operation === "string") {
- let colName2 = `${colName}_${operation}`
- colVal[colName2] = this.groupMathLog(keyVal[colName], operation)
+ const resultMap = new Map }>();
+ const groupColNames: Array = this.groupColNames as Array;
+
+ for (const [key, values] of Array.from(this._colDict.entries())) {
+ const colVal: { [key: string]: Array } = {};
+
+ for (let colKey = 0; colKey < groupColNames.length; colKey++) {
+ const colName = groupColNames[colKey];
+ const colIndex = this.columnName.indexOf(colName);
+ const colDtype = this.colDtype[colIndex];
+ const operationVal =
+ typeof operation === "string" ? operation : operation[colName];
+
+ if (colDtype === "string" && operationVal !== "count") {
+ throw new Error(`Can't perform math operation on column ${colName}`);
}
- else {
- if(Array.isArray(operation[colName])) {
- for(let ops of operation[colName]) {
- let colName2 = `${colName}_${ops}`
- colVal[colName2] = this.groupMathLog(keyVal[colName],ops)
+
+ if (typeof operation === "string") {
+ const colName2 = `${colName}_${operation}`;
+ colVal[colName2] = this.singleMathOperation(
+ values[colName] as Array,
+ operation
+ );
+ } else {
+ if (Array.isArray(operation[colName])) {
+ // Use multi-pass aggregation for multiple operations on same column
+ const operations = operation[colName] as string[];
+ const results = this.multiPassAggregation(
+ operations,
+ values[colName] as Array
+ );
+
+ for (const ops of operations) {
+ const colName2 = `${colName}_${ops}`;
+ colVal[colName2] = results[ops];
}
} else {
- let ops: string = operation[colName] as string
- let colName2 = `${colName}_${ops}`
- colVal[colName2] = this.groupMathLog(keyVal[colName], ops)
+ const ops: string = operation[colName] as string;
+ const colName2 = `${colName}_${ops}`;
+ colVal[colName2] = this.singleMathOperation(
+ values[colName] as Array,
+ ops
+ );
}
-
}
}
- colDict[key] = colVal
+ resultMap.set(key, colVal);
}
- return colDict
+ return resultMap;
}
/**
- * Peform all arithmetic logic
- * @param colVal
- * @param ops
+ * Convert array to typed array for better performance on numeric operations
*/
- private groupMathLog(colVal: Array, ops: string): Array{
- let data = []
- switch(ops) {
- case "max":
- let max = colVal.reduce((prev, curr)=> {
- if (prev > curr) {
- return prev
- }
- return curr
- })
- data.push(max)
- break;
- case "min":
- let min = colVal.reduce((prev, curr)=> {
- if (prev < curr) {
- return prev
- }
- return curr
- })
- data.push(min)
- break;
+ private optimizeNumericArray(
+ colVal: Array
+ ): Float64Array | Array {
+ // Use typed arrays for pure numeric data to improve performance
+ try {
+ // Check if all values are numeric
+ let allNumeric = true;
+ for (let i = 0; i < colVal.length && allNumeric; i++) {
+ if (typeof colVal[i] !== "number" || !isFinite(colVal[i])) {
+ allNumeric = false;
+ }
+ }
+
+ if (allNumeric && colVal.length > 10) {
+ // Only use for larger arrays
+ return new Float64Array(colVal);
+ }
+ } catch (e) {
+ // Fall back to regular array if typed array creation fails
+ }
+
+ return colVal;
+ }
+
+ /**
+ * Optimized math operations for typed arrays
+ */
+ private fastMathOperations = {
+ sum: (arr: Float64Array | Array): number => {
+ let sum = 0;
+ for (let i = 0; i < arr.length; i++) {
+ sum += arr[i];
+ }
+ return sum;
+ },
+
+ min: (arr: Float64Array | Array): number => {
+ let min = arr[0];
+ for (let i = 1; i < arr.length; i++) {
+ if (arr[i] < min) min = arr[i];
+ }
+ return min;
+ },
+
+ max: (arr: Float64Array | Array): number => {
+ let max = arr[0];
+ for (let i = 1; i < arr.length; i++) {
+ if (arr[i] > max) max = arr[i];
+ }
+ return max;
+ },
+
+ mean: (arr: Float64Array | Array): number => {
+ return this.fastMathOperations.sum(arr) / arr.length;
+ },
+ };
+
+ /**
+ * Multi-operation aggregation: computes the shared sum and min/max scans once
+ * and reuses them for every requested operation on the column (not a single pass)
+ */
+ private multiPassAggregation(
+ operations: string[],
+ colVal: Array
+ ): { [key: string]: Array } {
+ const results: { [key: string]: Array } = {};
+ const needsSum = operations.includes("sum") || operations.includes("mean");
+ const needsMinMax =
+ operations.includes("min") || operations.includes("max");
+ const needsCumulative = operations.some((op) => op.startsWith("cum"));
+
+ // Optimize array for numeric operations
+ const optimizedArray = this.optimizeNumericArray(colVal);
+ const length = optimizedArray.length;
+
+ // Use optimized operations for basic aggregations
+ let sum: number | undefined;
+ let min: number | undefined;
+ let max: number | undefined;
+
+ if (needsSum) {
+ sum = this.fastMathOperations.sum(optimizedArray);
+ }
+ if (needsMinMax) {
+ min = this.fastMathOperations.min(optimizedArray);
+ max = this.fastMathOperations.max(optimizedArray);
+ }
+
+ // Assign results for basic operations
+ for (const op of operations) {
+ switch (op) {
+ case "sum":
+ results[op] = [sum!];
+ break;
+ case "count":
+ results[op] = [length];
+ break;
+ case "mean":
+ results[op] = [sum! / length];
+ break;
+ case "min":
+ results[op] = [min!];
+ break;
+ case "max":
+ results[op] = [max!];
+ break;
+ case "std":
+ results[op] = [std(colVal)];
+ break;
+ case "var":
+ results[op] = [variance(colVal)];
+ break;
+ case "median":
+ results[op] = [median(colVal)];
+ break;
+ case "mode":
+ results[op] = [mode(colVal)];
+ break;
+ }
+ }
+
+ // Handle cumulative operations separately (they need arrays)
+ for (const op of operations) {
+ if (op.startsWith("cum")) {
+ results[op] = this.singleMathOperation(colVal, op);
+ }
+ }
+
+ return results;
+ }
+
+ /**
+ * Single operation computation (fallback for individual operations)
+ */
+ private singleMathOperation(
+ colVal: Array,
+ op: string
+ ): Array {
+ // Use optimized operations for basic math when possible
+ const optimizedArray = this.optimizeNumericArray(colVal);
+
+ switch (op) {
case "sum":
- let sum = colVal.reduce((prev, curr)=> {
- return prev + curr
- })
- data.push(sum)
- break;
- case "count":
- data.push(colVal.length)
- break;
+ return [this.fastMathOperations.sum(optimizedArray)];
case "mean":
- let sumMean = colVal.reduce((prev, curr)=> {
- return prev + curr
- })
- data.push(sumMean / colVal.length)
- break;
- case "std":
- data.push(std(colVal))
- break;
- case "var":
- data.push(variance(colVal))
- break;
- case "median":
- data.push(median(colVal))
- break;
- case "mode":
- data.push(mode(colVal))
- break;
- case "cumsum":
- colVal.reduce((prev, curr) => {
- let sum = prev + curr
- data.push(sum)
- return sum
- }, 0)
- break;
- case "cummin":
- data = [colVal[0]]
- colVal.slice(1,).reduce((prev, curr)=>{
- if (prev < curr) {
- data.push(prev)
- return prev
- }
- data.push(curr)
- return curr
- }, data[0])
- break;
- case "cummax":
- data = [colVal[0]]
- colVal.slice(1,).reduce((prev, curr)=> {
- if (prev > curr) {
- data.push(prev)
- return prev
- }
- data.push(curr)
- return curr
- }, data[0])
- break;
- case "cumprod":
- colVal.reduce((prev, curr) => {
- let sum = prev * curr
- data.push(sum)
- return sum
- }, 1)
- break;
+ return [this.fastMathOperations.mean(optimizedArray)];
+ case "min":
+ return [this.fastMathOperations.min(optimizedArray)];
+ case "max":
+ return [this.fastMathOperations.max(optimizedArray)];
+ case "count":
+ return [optimizedArray.length];
+ default:
+ // Fall back to original implementation for complex operations
+ const operation =
+ Groupby.mathOperations[op as keyof typeof Groupby.mathOperations];
+ return operation ? operation(colVal) : [];
}
- return data
+ }
+
+ // Lookup table of aggregation functions keyed by operation name; singleMathOperation falls back to it for operations outside its fast path
+ private static readonly mathOperations = {
+ max: (colVal: Array): Array => {
+ let max = colVal[0];
+ for (let i = 1; i < colVal.length; i++) {
+ if (colVal[i] > max) max = colVal[i];
+ }
+ return [max];
+ },
+ min: (colVal: Array): Array => {
+ let min = colVal[0];
+ for (let i = 1; i < colVal.length; i++) {
+ if (colVal[i] < min) min = colVal[i];
+ }
+ return [min];
+ },
+ sum: (colVal: Array): Array => {
+ let sum = 0;
+ for (let i = 0; i < colVal.length; i++) {
+ sum += colVal[i];
+ }
+ return [sum];
+ },
+ count: (colVal: Array): Array => [colVal.length],
+ mean: (colVal: Array): Array => {
+ let sum = 0;
+ for (let i = 0; i < colVal.length; i++) {
+ sum += colVal[i];
+ }
+ return [sum / colVal.length];
+ },
+ std: (colVal: Array): Array => [std(colVal)],
+ var: (colVal: Array): Array => [variance(colVal)],
+ median: (colVal: Array): Array => [median(colVal)],
+ mode: (colVal: Array): Array => [mode(colVal)],
+ cumsum: (colVal: Array): Array => {
+ const data: Array = [];
+ let sum = 0;
+ for (let i = 0; i < colVal.length; i++) {
+ sum += colVal[i];
+ data.push(sum);
+ }
+ return data;
+ },
+ cummin: (colVal: Array): Array => {
+ const data: Array = [colVal[0]];
+ let min = colVal[0];
+ for (let i = 1; i < colVal.length; i++) {
+ if (colVal[i] < min) min = colVal[i];
+ data.push(min);
+ }
+ return data;
+ },
+ cummax: (colVal: Array): Array => {
+ const data: Array = [colVal[0]];
+ let max = colVal[0];
+ for (let i = 1; i < colVal.length; i++) {
+ if (colVal[i] > max) max = colVal[i];
+ data.push(max);
+ }
+ return data;
+ },
+ cumprod: (colVal: Array): Array => {
+ const data: Array = [];
+ let prod = 1;
+ for (let i = 0; i < colVal.length; i++) {
+ prod *= colVal[i];
+ data.push(prod);
+ }
+ return data;
+ },
+ };
+
+ /**
+ * Perform all arithmetic logic (legacy method - use singleMathOperation instead)
+ * @param colVal
+ * @param ops
+ */
+ private groupMathLog(colVal: Array, ops: string): Array {
+ return this.singleMathOperation(colVal, ops);
}
/**
* Takes in internal groupby internal data and convert
* them to a single data frame.
- * @param colDict
+ * @param colDict
*/
- private toDataFrame(colDict: { [key: string ]: {} }): DataFrame {
- let data: { [key: string ]: ArrayType1D } = {}
-
- for(let key of this.colKeyDict(colDict)) {
- let value = colDict[key]
- let keyDict: { [key: string ]: ArrayType1D } = {}
- let oneValue = Object.values(value)[0] as ArrayType1D
- let valueLen = oneValue.length
- for(let key1 in this.keyCol) {
- let keyName = this.keyCol[key1] as string
- let keyValue = this.keyToValue[key][key1]
- keyDict[keyName] = Array(valueLen).fill(keyValue)
- }
- let combine: { [key: string ]: ArrayType1D } = {...keyDict, ...value}
- if(Object.keys(data).length < 1) {
- data = combine
+ private toDataFrame(
+ colDict: Map
+ ): DataFrame {
+ const data: { [key: string]: ArrayType1D } = {};
+ const keys = this.colKeyDict(colDict);
+
+ // Handle empty case - return empty DataFrame with proper column structure
+ if (keys.length === 0) {
+ const columns: string[] = [];
+ // Add key column names
+ for (let keyIdx = 0; keyIdx < this.keyCol.length; keyIdx++) {
+ const keyName = this.keyCol[keyIdx] as string;
+ columns.push(keyName);
+ data[keyName] = [];
+ }
+ // Add group column names if they exist
+ if (this.groupColNames) {
+ for (const colName of this.groupColNames) {
+ columns.push(colName);
+ data[colName] = [];
+ }
+ }
+ return new DataFrame([], { columns });
+ }
+
+ // Initialize data structure more efficiently
+ let isFirstGroup = true;
+
+ for (const key of keys) {
+ const value = colDict.get(key)!;
+ const valueEntries = Object.entries(value);
+ const oneValue = valueEntries[0][1] as ArrayType1D;
+ const valueLen = oneValue.length;
+
+ if (isFirstGroup) {
+ // Initialize arrays for the first group
+ // Add key columns with pre-allocated arrays (faster than Array.fill)
+ for (let keyIdx = 0; keyIdx < this.keyCol.length; keyIdx++) {
+ const keyName = this.keyCol[keyIdx] as string;
+ const keyValue = this.keyToValue.get(key)![keyIdx];
+ const keyArray = new Array(valueLen);
+ for (let i = 0; i < valueLen; i++) {
+ keyArray[i] = keyValue;
+ }
+ data[keyName] = keyArray;
+ }
+
+ // Add value columns
+ for (const [colName, colValues] of valueEntries) {
+ data[colName] = [...colValues];
+ }
+ isFirstGroup = false;
} else {
- for(let dataKey of Object.keys(data)) {
- let dataValue = combine[dataKey] as ArrayType1D
- data[dataKey] = [...data[dataKey], ...dataValue]
+ // Append to existing arrays using batch operations
+ // Add key columns with optimized batch assignment
+ for (let keyIdx = 0; keyIdx < this.keyCol.length; keyIdx++) {
+ const keyName = this.keyCol[keyIdx] as string;
+ const keyValue = this.keyToValue.get(key)![keyIdx];
+ const existingArray = data[keyName] as any[];
+ const startIndex = existingArray.length;
+
+ // Extend array length once, then assign directly
+ existingArray.length += valueLen;
+ for (let i = 0; i < valueLen; i++) {
+ existingArray[startIndex + i] = keyValue;
+ }
+ }
+
+ // Add value columns with optimized batch copying
+ for (const [colName, colValues] of valueEntries) {
+ const existingArray = data[colName] as any[];
+ const startIndex = existingArray.length;
+
+ // Extend array length once, then copy directly
+ existingArray.length += colValues.length;
+ for (let i = 0; i < colValues.length; i++) {
+ existingArray[startIndex + i] = colValues[i];
+ }
}
}
}
- return new DataFrame(data)
+
+ return new DataFrame(data);
}
private operations(ops: string): DataFrame {
+ // Handle empty case early
+ if (this._colDict.size === 0) {
+ const columns: string[] = [];
+ // Add key column names
+ for (let keyIdx = 0; keyIdx < this.keyCol.length; keyIdx++) {
+ const keyName = this.keyCol[keyIdx] as string;
+ columns.push(keyName);
+ }
+ // Add result column names
+ const targetColumns =
+ this.groupColNames ||
+ this.columnName.filter((_, index) => !this.colIndex.includes(index));
+ for (const colName of targetColumns) {
+ columns.push(`${colName}_${ops}`);
+ }
+ return new DataFrame([], { columns });
+ }
+
if (!this.groupColNames) {
- let colGroup = this.col(undefined)
- let colDict = colGroup.arithemetic(ops)
- let df = colGroup.toDataFrame(colDict)
- return df
+ let colGroup = this.col(undefined);
+ let colDict = colGroup.arithemetic(ops);
+ let df = colGroup.toDataFrame(colDict);
+ return df;
}
- let colDict = this.arithemetic(ops)
- let df = this.toDataFrame(colDict)
- return df
+ let colDict = this.arithemetic(ops);
+ let df = this.toDataFrame(colDict);
+ return df;
}
/**
* Obtain the count for each group
* @returns DataFrame
- *
+ *
*/
count(): DataFrame {
- return this.operations("count")
+ return this.operations("count");
}
/**
* Obtain the sum of columns for each group
* @returns DataFrame
- *
+ *
*/
- sum(): DataFrame{
- return this.operations("sum")
+ sum(): DataFrame {
+ return this.operations("sum");
}
/**
* Obtain the standard deviation of columns for each group
* @returns DataFrame
*/
- std(): DataFrame{
- return this.operations("std")
+ std(): DataFrame {
+ return this.operations("std");
}
/**
* Obtain the variance of columns for each group
* @returns DataFrame
*/
- var(): DataFrame{
- return this.operations("var")
+ var(): DataFrame {
+ return this.operations("var");
}
/**
* Obtain the mean of columns for each group
* @returns DataFrame
*/
- mean(): DataFrame{
- return this.operations("mean")
+ mean(): DataFrame {
+ return this.operations("mean");
}
/**
* Obtain the cumsum of columns for each group
* @returns DataFrame
- *
+ *
*/
- cumSum(): DataFrame{
- return this.operations("cumsum")
+ cumSum(): DataFrame {
+ return this.operations("cumsum");
}
/**
* Obtain the cummax of columns for each group
* @returns DataFrame
*/
- cumMax(): DataFrame{
- return this.operations("cummax")
+ cumMax(): DataFrame {
+ return this.operations("cummax");
}
/**
* Obtain the cumprod of columns for each group
* @returns DataFrame
*/
- cumProd(): DataFrame{
- return this.operations("cumprod")
+ cumProd(): DataFrame {
+ return this.operations("cumprod");
}
/**
* Obtain the cummin of columns for each group
* @returns DataFrame
*/
- cumMin(): DataFrame{
- return this.operations("cummin")
+ cumMin(): DataFrame {
+ return this.operations("cummin");
}
/**
* Obtain the max value of columns for each group
* @returns DataFrame
- *
+ *
*/
- max(): DataFrame{
- return this.operations("max")
+ max(): DataFrame {
+ return this.operations("max");
}
/**
* Obtain the min of columns for each group
* @returns DataFrame
*/
- min(): DataFrame{
- return this.operations("min")
+ min(): DataFrame {
+ return this.operations("min");
}
/**
@@ -522,18 +873,42 @@ export default class Groupby {
* @returns DataFrame
*/
getGroup(keys: Array): DataFrame {
- let dictKey = keys.join("-")
- let colDict: { [key: string ]: {} } = {}
- colDict[dictKey] = {...this.colDict[dictKey]}
- return this.toDataFrame(colDict)
+ const dictKey = keys.join("-");
+ const colDict = new Map();
+ const groupData = this._colDict.get(dictKey);
+ if (groupData) {
+ colDict.set(dictKey, groupData);
+ }
+ return this.toDataFrame(colDict);
}
/**
* Perform aggregation on all groups
- * @param ops
+ * @param ops
* @returns DataFrame
*/
- agg(ops: { [key: string ]: Array | string }): DataFrame {
+ agg(ops: { [key: string]: Array | string }): DataFrame {
+ // Handle empty case early
+ if (this._colDict.size === 0) {
+ const columns: string[] = [];
+ // Add key column names
+ for (let keyIdx = 0; keyIdx < this.keyCol.length; keyIdx++) {
+ const keyName = this.keyCol[keyIdx] as string;
+ columns.push(keyName);
+ }
+ // Add result column names for each operation
+ for (const [colName, operations] of Object.entries(ops)) {
+ if (Array.isArray(operations)) {
+ for (const op of operations) {
+ columns.push(`${colName}_${op}`);
+ }
+ } else {
+ columns.push(`${colName}_${operations}`);
+ }
+ }
+ return new DataFrame([], { columns });
+ }
+
let columns = Object.keys(ops);
let col_gp = this.col(columns);
let data = col_gp.arithemetic(ops);
@@ -544,79 +919,106 @@ export default class Groupby {
/**
* Apply custom aggregator function
* to each group
- * @param callable
+ * @param callable
* @returns DataFrame
* @example
* let grp = df.groupby(['A'])
* grp.apply((x) => x.count())
*/
- apply(callable: (x: DataFrame)=> DataFrame | Series ): DataFrame {
- let colDict: { [key: string ]: DataFrame | Series } = {}
- for(const key of this.colKeyDict(this.colDict)) {
- let valDataframe = new DataFrame(this.colDict[key])
- colDict[key] = callable(valDataframe)
+ apply(callable: (x: DataFrame) => DataFrame | Series): DataFrame {
+ const colDict: { [key: string]: DataFrame | Series } = {};
+ const keys = this.colKeyDict(this._colDict);
+
+ for (const key of keys) {
+ const groupData = this._colDict.get(key)!;
+ const valDataframe = new DataFrame(groupData);
+ colDict[key] = callable(valDataframe);
}
- return this.concatGroups(colDict)
+ return this.concatGroups(colDict);
}
- private concatGroups(colDict: {[key: string]: DataFrame | Series}): DataFrame {
- let data: Array = []
- for(const [key, values] of Object.entries(colDict)) {
+ private concatGroups(colDict: {
+ [key: string]: DataFrame | Series;
+ }): DataFrame {
+ let data: Array = [];
+ for (const [key, values] of Object.entries(colDict)) {
let copyDf: DataFrame;
if (values instanceof DataFrame) {
- copyDf = values.copy()
- }
- else {
- let columns = values.index as string[]
- columns = columns.length > 1 ? columns : ['applyOps']
- copyDf = new DataFrame([values.values], {columns: columns })
- }
- let len = copyDf.shape[0]
- let key1: any;
- for(key1 in this.keyCol){
-
- let keyName = this.keyCol[key1] as string
- let keyValue = this.keyToValue[key][key1]
- let dfValue = Array(len).fill(keyValue)
- let atIndex: number = parseInt(key1)
- if (this.groupColNames) {
- copyDf.addColumn(keyName, dfValue, {inplace: true, atIndex: atIndex })
+ copyDf = values.copy();
+ } else {
+ let columns = values.index as string[];
+ columns = columns.length > 1 ? columns : ["applyOps"];
+ copyDf = new DataFrame([values.values], { columns: columns });
+ }
+ let len = copyDf.shape[0];
+ const keyValues = this.keyToValue.get(key)!;
+ for (let keyIdx = 0; keyIdx < this.keyCol.length; keyIdx++) {
+ const keyName = this.keyCol[keyIdx] as string;
+ const keyValue = keyValues[keyIdx];
+ // Use pre-allocated array instead of Array.fill()
+ const dfValue = new Array(len);
+ for (let i = 0; i < len; i++) {
+ dfValue[i] = keyValue;
}
- else {
- copyDf.addColumn(`${keyName}_Group`, dfValue, {inplace: true, atIndex: atIndex })
+
+ if (this.groupColNames) {
+ copyDf.addColumn(keyName, dfValue, {
+ inplace: true,
+ atIndex: keyIdx,
+ });
+ } else {
+ copyDf.addColumn(`${keyName}_Group`, dfValue, {
+ inplace: true,
+ atIndex: keyIdx,
+ });
}
-
}
- data.push(copyDf)
+ data.push(copyDf);
}
- return concat({dfList: data, axis:0}) as DataFrame
+ return concat({ dfList: data, axis: 0 }) as DataFrame;
}
-
+
/**
* obtain the total number of groups
* @returns number
*/
- get ngroups(): number{
- let keys = Object.keys(this.colDict)
- return keys.length
+ get ngroups(): number {
+ return this._colDict.size;
}
/**
* obtaind the internal group data
- * @returns {[keys: string]: {}}
+ * @returns { [key: string]: { [key: string]: ArrayType1D } } (backward compatibility)
+ */
+ get groups(): { [key: string]: { [key: string]: ArrayType1D } } {
+ // Ensure grouping has been done
+ if (this._colDict.size === 0) {
+ this.group();
+ }
+ // Convert Map to object for backward compatibility
+ const result: { [key: string]: { [key: string]: ArrayType1D } } = {};
+ Array.from(this._colDict.entries()).forEach(([key, value]) => {
+ result[key] = value;
+ });
+ return result;
+ }
+
+ /**
+ * Backward compatibility for colDict property access
+ * @returns { [key: string]: { [key: string]: ArrayType1D } }
*/
- get groups(): {[keys: string]: {}}{
- return this.colDict
+ get colDict(): { [key: string]: { [key: string]: ArrayType1D } } {
+ return this.groups;
}
/**
* Obtain the first row of each group
* @returns DataFrame
*/
- first(): DataFrame{
- return this.apply((x)=>{
- return x.head(1)
- })
+ first(): DataFrame {
+ return this.apply((x) => {
+ return x.head(1);
+ });
}
/**
@@ -624,9 +1026,9 @@ export default class Groupby {
* @returns DataFrame
*/
last(): DataFrame {
- return this.apply((x)=>{
- return x.tail(1)
- })
+ return this.apply((x) => {
+ return x.tail(1);
+ });
}
/**
@@ -634,28 +1036,35 @@ export default class Groupby {
* @returns DataFrame
*/
size(): DataFrame {
- return this.apply((x)=>{
- return new Series([x.shape[0]])
- })
+ return this.apply((x) => {
+ return new Series([x.shape[0]]);
+ });
}
- private colKeyDict(colDict: { [key: string ]: {} }): string[]{
- let keyDict :{ [key: string ]: string[] } = {}
+ private colKeyDict(
+ colDict: Map
+ ): string[] {
+ const keyDict: { [key: string]: string[] } = {};
+ const firstKeyOrder: string[] = [];
- for(let key of Object.keys(colDict)) {
- let firstKey = key.split("-")[0]
+ // Collect keys and group by first key, preserving insertion order
+ for (const key of Array.from(colDict.keys())) {
+ const firstKey = key.split("-")[0];
if (firstKey in keyDict) {
- keyDict[firstKey].push(key)
- }
- else {
- keyDict[firstKey] = [key]
+ keyDict[firstKey].push(key);
+ } else {
+ keyDict[firstKey] = [key];
+ firstKeyOrder.push(firstKey);
}
}
- let keys = []
- for(let key of Object.keys(keyDict)) {
- keys.push(...keyDict[key])
+
+ // Preserve first key appearance order (don't sort alphabetically)
+ const sortedFirstKeys = firstKeyOrder;
+ const keys: string[] = [];
+ for (const firstKey of sortedFirstKeys) {
+ // Preserve insertion order within each group
+ keys.push(...keyDict[firstKey]);
}
- return keys
+ return keys;
}
-
-}
\ No newline at end of file
+}