第2个小脚本:批量读取所有英文txt文章内容,提取高频的单词
如何把网站的英文文章内容下载到txt文件,请看第3个小脚本:批量下载某网站的所有英文内容到txt。
代码如下:
#第二步:从当前目录读取txt文件,并统计高频单词
import os
import re
from collections import Counter
def read_txt(file_path):
    """Return the entire contents of the UTF-8 text file at *file_path*."""
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()
def count_word(text):
    """Count frequencies of purely alphabetic words in *text*.

    Punctuation is stripped, the text is lower-cased, and only tokens
    consisting solely of the letters a-z are counted.

    Returns a collections.Counter mapping word -> occurrence count.
    """
    # Drop every character that is neither a word character nor whitespace,
    # then normalise to lower case in one pass.
    cleaned = re.sub(r'[^\w\s]', '', text).lower()
    # Keep only tokens made entirely of ASCII letters (digits/underscores
    # survive the substitution above, so they are filtered out here).
    only_letters = re.compile(r'^[a-z]+$')
    tokens = (tok for tok in cleaned.split() if only_letters.match(tok))
    return Counter(tokens)
if __name__ == '__main__':
    # Collect every .txt file in the current working directory.
    current_dir = os.getcwd()
    txt_files = [os.path.join(current_dir, name)
                 for name in os.listdir(current_dir)
                 if name.endswith('.txt')]

    # Aggregate word frequencies across all the files.
    word_count = Counter()
    for file_path in txt_files:
        word_count += count_word(read_txt(file_path))

    # Print qualifying words and save them (one word per line) to a file.
    # NOTE: count_word only yields tokens matching ^[a-z]+$, so the original
    # '"__" not in word' check could never be false — dead code, removed.
    index = 0
    with open('high_frequency_words.txt', 'w', encoding='utf-8') as output_file:
        # Consider at most the 5000 most common words, then keep only
        # reasonably-sized words that occur more than once.
        for word, count in word_count.most_common(5000):
            if 3 <= len(word) <= 15 and count > 1:
                index += 1
                print(f"{index}. {word}: {count}")
                output_file.write(f"{word}\n")
    print('生成')