优质的武进网站建设公司网站建设任务书
如何把网站的英文文章内容下载到 txt 文件，请参见第 3 个小脚本：批量下载某网站的所有英文内容到 txt。
代码如下:
#第二步:从当前目录读取txt文件,并统计高频单词
import os
import re
from collections import Counter

# Name of the report written by main(). It lives in the same directory that is
# scanned for input, so it has to be excluded from the inputs explicitly.
OUTPUT_FILE = 'high_frequency_words.txt'

# Upper bound on how many of the most common words are considered for output.
MAX_WORDS = 5000

# A token counts as an English word only if it consists solely of a-z.
_WORD_RE = re.compile(r'[a-z]+')


def read_txt(file_path):
    """Return the full contents of the UTF-8 text file at *file_path*."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()


def count_word(text):
    """Count lowercase ASCII words in *text*.

    Punctuation is deleted (not replaced by a space), the text is lowercased
    and split on whitespace, and every token that is not made up purely of
    the letters a-z is discarded.

    Returns:
        A ``Counter`` mapping each remaining word to its number of occurrences.
    """
    # Deleting punctuation joins the pieces around it: "don't" -> "dont".
    text = re.sub(r'[^\w\s]', '', text)
    words = text.lower().split()
    # Tokens containing digits, underscores or non-ASCII letters are dropped.
    return Counter(word for word in words if _WORD_RE.fullmatch(word))


def list_txt_files(directory, exclude=OUTPUT_FILE):
    """Return sorted paths of the regular ``.txt`` files in *directory*.

    The file named *exclude* is skipped so that the report written by a
    previous run is not counted as input on the next run. Entries that are
    not regular files (e.g. a directory called ``notes.txt``) are skipped too.
    Sorting makes the processing order, and therefore the order of words with
    equal counts, reproducible.
    """
    paths = []
    for name in sorted(os.listdir(directory)):
        if not name.endswith('.txt') or name == exclude:
            continue
        path = os.path.join(directory, name)
        if os.path.isfile(path):
            paths.append(path)
    return paths


def main():
    """Count words over all txt files in the current directory and report them.

    Words with a length of 3 to 15 characters that occur more than once are
    printed with a running index and written, one per line, to OUTPUT_FILE.
    """
    word_count = Counter()
    for file_path in list_txt_files(os.getcwd()):
        word_count.update(count_word(read_txt(file_path)))

    index = 0
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as output_file:
        for word, count in word_count.most_common(MAX_WORDS):
            # No check for "__" is needed: count_word only yields a-z words.
            if 3 <= len(word) <= 15 and count > 1:
                index += 1
                print(f"{index}. {word}: {count}")
                output_file.write(f"{word}\n")
    print('生成')


if __name__ == '__main__':
    main()