// C++实现文本文件中的单词统计等
// 采用C++的容器、迭代器、泛型算法等实现:从指定文本文件(英文文章)中读取单词,过滤掉指定过滤词,并统计剩下的单词出现次数。
// 使用了vector、string、unordered_set、map、istream_iterator、ostream_iterator、copy等STL元素。
//copy_clean_filter.cpp
//
//读取指定文本(英文)文件的内容,
//1. 输出其中不含标点和排除单词的所有单词
//2. 统计每个单词出现次数
//3. 按照次数排序依次输出不同频次对应的单词列表
#include <algorithm>
#include <cctype>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <numeric>
#include <string>
#include <unordered_set>
#include <vector>

#include <windows.h>
using std::string;
using std::map;
using std::vector;
//using std::ordered_map;
// 1. Strip punctuation (and digits) from a string.
/// Returns a copy of `s` without any character classified by std::ispunct
/// or std::isdigit. All other characters, including whitespace, are kept
/// in their original order.
std::string remove_punctuation(const std::string& s) {
    std::string kept;
    kept.reserve(s.size());
    for (const char raw : s) {
        // The <cctype> classifiers must be given a value representable as
        // unsigned char, so convert before classifying.
        const unsigned char ch = static_cast<unsigned char>(raw);
        if (std::ispunct(ch) || std::isdigit(ch)) {
            continue;
        }
        kept.push_back(raw);
    }
    return kept;
}
// 2. Convert to lowercase (so that exclusion-word matching is case-insensitive)
std::string to_lower(const std::string& s) {std::string result;std::transform(s.begin(), s.end(), std::back_inserter(result),[](unsigned char c) { return std::tolower(c); });return result;
}int main() {const std::unordered_set<std::string> exclude = {"a", "an", "the", "and", "but", "so", "or"," ","for","in","is","at","of","to","it","if","as"};//SetConsoleOutputCP(65001); // 设置控制台输出为 UTF-8//SetConsoleCP(65001); // 设置控制台输入为 UTF-8std::string filename;map<string, int> word_cnt;std::cout << "请输入文件名: ";std::cin >> filename;std::ifstream file(filename);if (!file) {std::cerr << "无法打开文件: " << filename << std::endl;return -1;}// 读取原始单词到临时容器std::vector<std::string> raw_words;std::copy(std::istream_iterator<std::string>(file),std::istream_iterator<std::string>(),std::back_inserter(raw_words));// 1. 处理:去标点 + 转小写std::vector<std::string> processed_words;std::transform(raw_words.begin(), raw_words.end(),std::back_inserter(processed_words),[](const std::string& s) {return to_lower(remove_punctuation(s));});// 2. 过滤:排除空字符串和禁用词std::vector<std::string> filtered_words;std::remove_copy_if(processed_words.begin(), processed_words.end(),std::back_inserter(filtered_words),[&exclude](const std::string& s) {return s.empty() || exclude.count(s);});// 输出结果std::cout << "\n处理后的单词列表:\n";for (const auto& w : filtered_words) {std::cout << w << " ";}// 统计每个单词出现次数for (auto& s : filtered_words)word_cnt[s]++;for (auto& s : word_cnt)std::cout << s.first << ":" << s.second << std::endl;std::cout << std::endl;//按 频次:对应单词 造表map<int, vector<string>> freq_words;for (auto& s : word_cnt)freq_words[s.second].push_back(s.first);std::cout << string("\n频次及对应单词列表") << std::endl;for (auto& s : freq_words){std::cout << s.first << "("<<s.second.size()<<")" << string(":");std::copy(s.second.begin(), s.second.end(),std::ostream_iterator<string>(std::cout, " "));std::cout << std::endl;}// 使用 std::for_each 替代 for 循环std::cout << string("\n使用 std::for_each 替代 for 循环") << std::endl;std::for_each(freq_words.begin(),freq_words.end(),[](const auto& s) { // Lambda 表达式处理每个键值对std::cout << s.first << "(" << s.second.size() << "): ";// 输出 vector 
中的所有字符串std::copy(s.second.begin(),s.second.end(),std::ostream_iterator<std::string>(std::cout, " "));std::cout << std::endl;});//测试数值算法--与本例无关std::vector<int> vec(5);/*用一个起始值依次递增地为范围内的元素赋值,第一个参数是起始迭代器,第二个参数是结束迭代器,第三个参数是初始值,每次赋值后该值会自动加 1*/std::iota(vec.begin(), vec.end(), 0);std::copy(vec.begin(),vec.end(),std::ostream_iterator<int>(std::cout, " "));std::cout << std::endl;return 0;
}