【Python NLTK自然语言处理库】
安装流程
# One-time setup: launch the NLTK Downloader (opens a GUI; click "Download").
import nltk
nltk.download()
运行后出现一个界面,然后按Download
Tokenize
###分词
# English word tokenization: splits punctuation and currency symbols
# into separate tokens while keeping "20,000,000" intact.
from nltk.tokenize import word_tokenize

sentence = "The vendor paid $20,000,000."
words = word_tokenize(sentence)
print(words)
输出
['The', 'vendor', 'paid', '$', '20,000,000', '.']
###分句
# Sentence splitting: break a paragraph into individual sentences.
import nltk

paragraph = "I am Angela. I am happy."
sentences = nltk.sent_tokenize(paragraph)
print(sentences)
输出
['I am Angela.', 'I am happy.']
###中文分词
# Chinese word segmentation with jieba (lcut returns a plain list).
from jieba import lcut

sample = "我正在練習自然語言處理。"
segments = lcut(sample)
print(segments)
输出
['我', '正在', '練習', '自然', '語言', '處理', '。']
停用词
过滤停用词
# Tokenize an English sentence, then drop stopwords (case-insensitively).
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

text = "I would like to watch movie."
tokens = word_tokenize(text)  # fixed: stray trailing "tokens" was a SyntaxError
print(tokens)

# Build the stopword set once; membership tests are O(1).
stopwords_list = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stopwords_list]
print(filtered_tokens)
输出
['I', 'would', 'like', 'to', 'watch', 'movie', '.']
['would', 'like', 'watch', 'movie', '.']
标签
# Part-of-speech tagging: tokenize first, then tag each token
# with its Penn Treebank POS label.
import nltk

phrase = "I am happy."
words = nltk.word_tokenize(phrase)
tagged = nltk.pos_tag(words)
print(tagged)
输出
[('I', 'PRP'), ('am', 'VBP'), ('happy', 'JJ'), ('.', '.')]
词频
# Word-frequency counting: lowercase, tokenize, strip stopwords and the
# period, then tally with FreqDist and show the top 3 words.
import nltk
from nltk.corpus import stopwords

sentence = "I would like to buy a book. The book was bought by me."
full_stop = "."

words = nltk.word_tokenize(sentence.lower())
ignore = set(stopwords.words('english'))
ignore.add(full_stop)  # treat the sentence-ending period as noise too

kept = [w for w in words if w not in ignore]
print(kept)

freq = nltk.FreqDist(kept)
for word, count in freq.items():
    print(str(word) + ':' + str(count))
print(freq.most_common(3))
输出
['would', 'like', 'buy', 'book', 'book', 'bought']
would:1
like:1
buy:1
book:2
bought:1
[('book', 2), ('would', 1), ('like', 1)]