Hello everyone! Today, I will guide you through implementing the powerful grouping and summarization functionality in Excel using C++.
1. Core Idea of Grouping and Summarization Functionality
The grouping and summarization in Excel mainly accomplishes three tasks:
- Grouping: Grouping data based on the values of specified columns
- Summarization: Performing statistical calculations (sum, average, etc.) for each group
- Display: Showing results in a hierarchical structure that can be expanded/collapsed
To implement this functionality in C++, we need:
- Data Grouping Algorithm: Efficiently categorizing data
- Aggregation Calculations: Implementing various statistical functions
- Tree Structure: Managing the hierarchical relationships of summarized results
2. Basic Data Structure Design
1. Defining Data Record Structure
#include <vector>
#include <string>
#include <variant>
using Value = std::variant<std::string, int, double>;
struct Record {
std::vector<Value> values; // Multiple fields of a record
};
class Dataset {
private:
std::vector<std::string> headers; // Column titles
std::vector<Record> records; // Data records
public:
void addRecord(const Record& record) {
records.push_back(record);
}
// Other methods...
};
2. Designing Summary Result Nodes
struct SummaryNode {
std::string groupKey; // Group key value
std::vector<double> summaries; // Summary values for each column
std::vector<SummaryNode> children;// Child nodes
// Adding summary data
void accumulate(const Record& record,
const std::vector<int>& summaryColumns) {
for(int i = 0; i < summaryColumns.size(); ++i) {
int col = summaryColumns[i];
if(auto pval = std::get_if<double>(&record.values[col])) {
summaries[i] += *pval;
}
}
}
};
3. Core Algorithm Implementation
1. Data Grouping Algorithm
#include <unordered_map>
SummaryNode buildSummaryTree(Dataset& dataset,
int groupColumn,
const std::vector<int>& summaryColumns) {
SummaryNode root;
root.groupKey = "Total";
root.summaries.resize(summaryColumns.size());
// Using hash table to speed up grouping
std::unordered_map<std::string, SummaryNode> groups;
for(auto& record : dataset.records) {
// Get group key
std::string key = std::get<std::string>(record.values[groupColumn]);
// Find or create group node
auto& groupNode = groups[key];
groupNode.groupKey = key;
if(groupNode.summaries.empty()) {
groupNode.summaries.resize(summaryColumns.size());
}
// Accumulate data
groupNode.accumulate(record, summaryColumns);
root.accumulate(record, summaryColumns);
}
// Add group nodes to root node
for(auto& [key, node] : groups) {
root.children.push_back(node);
}
return root;
}
Tip:
- Use
<span>unordered_map</span>
to achieve O(1) complexity for grouping - Support multi-column aggregation calculations
- Can be extended to multi-level grouping
2. Multi-Level Grouping and Summarization
SummaryNode buildMultiLevelSummary(Dataset& dataset,
const std::vector<int>& groupColumns,
const std::vector<int>& summaryColumns) {
SummaryNode root;
root.groupKey = "Total";
root.summaries.resize(summaryColumns.size());
// Recursively build grouping tree
std::function<void(SummaryNode&, int, const std::vector<Record>&)> buildLevel;
buildLevel = [&](SummaryNode& parent, int level, const std::vector<Record>& records) {
if(level >= groupColumns.size()) return;
std::unordered_map<std::string, std::vector<Record>> subGroups;
int groupCol = groupColumns[level];
// Group by current level
for(auto& record : records) {
std::string key = std::get<std::string>(record.values[groupCol]);
subGroups[key].push_back(record);
}
// Process each subgroup
for(auto& [key, groupRecords] : subGroups) {
SummaryNode child;
child.groupKey = key;
child.summaries.resize(summaryColumns.size());
// Calculate summary for current group
for(auto& record : groupRecords) {
child.accumulate(record, summaryColumns);
}
// Recursively process next level
buildLevel(child, level + 1, groupRecords);
parent.children.push_back(child);
}
};
buildLevel(root, 0, dataset.records);
// Calculate total
for(auto& record : dataset.records) {
root.accumulate(record, summaryColumns);
}
return root;
}
4. Result Display and Interaction
1. Console Output of Summary Tree
void printSummaryTree(const SummaryNode& node, int level = 0) {
// Indentation indicates level
std::string indent(level * 4, ' ');
// Print current node
std::cout << indent << node.groupKey << ": ";
for(auto sum : node.summaries) {
std::cout << sum << "\t";
}
std::cout << std::endl;
// Recursively print child nodes
for(auto& child : node.children) {
printSummaryTree(child, level + 1);
}
}
2. Interactive Expand/Collapse
class InteractiveSummaryViewer {
private:
std::vector<bool> expanded; // Record the expanded state of each level
public:
void printInteractive(const SummaryNode& node, int level = 0) {
std::string indent(level * 2, ' ');
// Expand/collapse marker
char marker = '+';
if(level < expanded.size() && expanded[level]) {
marker = '-';
}
// Print current node
if(level > 0) {
std::cout << indent << marker << " ";
}
std::cout << node.groupKey << std::endl;
// Process child nodes
if(level >= expanded.size() || expanded[level]) {
for(auto& child : node.children) {
printInteractive(child, level + 1);
}
}
}
void toggleLevel(int level) {
if(level >= expanded.size()) {
expanded.resize(level + 1);
}
expanded[level] = !expanded[level];
}
};
5. Advanced Feature Extensions
1. Support for Multiple Aggregation Functions
enum class AggregateFunc { SUM, AVG, COUNT, MAX, MIN };
class Aggregator {
public:
virtual void init() = 0;
virtual void accumulate(const Value& value) = 0;
virtual double getResult() = 0;
};
class SumAggregator :public Aggregator {
private:
double sum = 0;
public:
void init() override { sum = 0; }
void accumulate(const Value& value) override {
if(auto pval = std::get_if<double>(&value)) {
sum += *pval;
}
}
double getResult() override { return sum; }
};
// Use aggregator in SummaryNode
std::vector<std::unique_ptr<Aggregator>> aggregators;
2. Result Sorting Functionality
void sortSummaryTree(SummaryNode& node,
int sortColumn,
bool ascending = true) {
// Sort child nodes
std::sort(node.children.begin(), node.children.end(),
[&](const SummaryNode& a, const SummaryNode& b) {
double valA = a.summaries[sortColumn];
double valB = b.summaries[sortColumn];
return ascending ? valA < valB : valA > valB;
});
// Recursively sort child nodes' children
for(auto& child : node.children) {
sortSummaryTree(child, sortColumn, ascending);
}
}
3. Result Filtering Functionality
void filterSummaryTree(SummaryNode& node,
const std::function<bool(const SummaryNode&)> predicate) {
// Filter child nodes
auto it = std::remove_if(node.children.begin(), node.children.end(),
[&](const SummaryNode& child) {
return !predicate(child);
});
node.children.erase(it, node.children.end());
// Recursively filter
for(auto& child : node.children) {
filterSummaryTree(child, predicate);
}
}
6. Performance Optimization Techniques
1. Parallel Computation Optimization
#include <execution>
void parallelAccumulate(SummaryNode& node,
const std::vector<Record>& records,
const std::vector<int>& summaryColumns) {
std::vector<double> sums(summaryColumns.size(), 0);
std::for_each(std::execution::par, records.begin(), records.end(),
[&](const Record& record) {
for(int i = 0; i < summaryColumns.size(); ++i) {
int col = summaryColumns[i];
if(auto pval = std::get_if<double>(&record.values[col])) {
// Thread-safe accumulation
#pragma omp atomic
sums[i] += *pval;
}
}
});
node.summaries = sums;
}
7. Considerations
-
Memory Management:
- Consider using pointers or smart pointers for large datasets
- Avoid unnecessary copies
Type Safety:
// Add type checking
void accumulate(const Record& record) {
for(int col : summaryColumns) {
if(!std::holds_alternative<double>(record.values[col])) {
throw std::runtime_error("Non-numeric columns cannot be summed");
}
}
}
Extension Suggestions:
- Add result export functionality
- Support custom grouping logic
- Implement incremental updates for summary data
8. Hands-On Practice
Try these interesting exercises:
- Implement a function to display summary results with pagination
- Add support for grouping by date types (monthly/yearly)
- Implement visualization of summary results in charts
- Create summary operations that support undo/redo
Conclusion
Today we delved into:
- Efficient implementation of grouping and summarization algorithms
- Management of summarized results in a tree structure
- Interactive display and operations
- Performance optimization and advanced feature extensions
Grouping and summarization is a core functionality in data analysis, mastering its implementation principles can greatly enhance your data handling capabilities. It is recommended to start with simple single-level summaries and gradually implement more complex features.