第2个小脚本:批量读取所有英文txt文章内容,提取高频的单词
如何把网站的英文文章内容下载到txt文件,请看第3个小脚本:批量下载某网站的所有英文内容到txt。
代码如下:
#第二步:从当前目录读取txt文件,并统计高频单词
import os
import re
from collections import Counter
def read_txt(file_path):
    """Return the entire contents of the UTF-8 text file at *file_path*."""
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()
def count_word(text):
    """Count frequencies of purely alphabetic words in *text*.

    Punctuation is stripped, the text is lower-cased, and only tokens
    consisting solely of the letters a-z are counted.

    Returns a collections.Counter mapping word -> occurrence count.
    """
    # Drop every character that is neither a word character nor whitespace,
    # then normalise to lower case in one pass.
    cleaned = re.sub(r'[^\w\s]', '', text).lower()
    # Keep only tokens made entirely of ASCII letters (digits/underscores
    # survive the substitution above, so they are filtered out here).
    only_letters = re.compile(r'^[a-z]+$')
    tokens = (tok for tok in cleaned.split() if only_letters.match(tok))
    return Counter(tokens)
if __name__ == '__main__':
    # Collect every .txt file in the current working directory.
    current_dir = os.getcwd()
    txt_files = [os.path.join(current_dir, name)
                 for name in os.listdir(current_dir)
                 if name.endswith('.txt')]

    # Aggregate word frequencies across all the files.
    word_count = Counter()
    for file_path in txt_files:
        word_count += count_word(read_txt(file_path))

    # Print qualifying words and save them (one word per line) to a file.
    # NOTE: count_word only yields tokens matching ^[a-z]+$, so the original
    # '"__" not in word' check could never be false — dead code, removed.
    index = 0
    with open('high_frequency_words.txt', 'w', encoding='utf-8') as output_file:
        # Consider at most the 5000 most common words, then keep only
        # reasonably-sized words that occur more than once.
        for word, count in word_count.most_common(5000):
            if 3 <= len(word) <= 15 and count > 1:
                index += 1
                print(f"{index}. {word}: {count}")
                output_file.write(f"{word}\n")
    print('生成')