Python计算豆瓣热门电影分类TF-IDF
环境
PyCharm 2020.2.4 (Professional Edition)
bs4==0.0.1
urllib3==1.24.2
MongoDB 3.4.10
Python 3.7.4
pymongo 3.11.3
TF
词频(Term Frequency, TF)反映词条 $term_i$ 在文档 $document_j$ 中出现的频率。设 $count_i$ 为词条 $term_i$ 在文档 $document_j$ 中出现的次数,$total\_count$ 为词条总数,则:

$$TF_{i,j} = \frac{count_i}{total\_count}$$
def calculate_TF(term_vectors, terms_count):
    """Compute term frequencies for each term vector.

    Every raw count is divided by the shared vocabulary size
    ``terms_count``; one frequency dict is returned per input vector,
    in the same order.
    """
    return [
        {word: occurrences / terms_count for word, occurrences in vector.items()}
        for vector in term_vectors
    ]
IDF
逆文本频率(Inverse Document Frequency, IDF)反映含某词条 $term_i$ 的文档在语料库中的比重,比重越小 IDF 值越大,因为比重越小越能够在语料库中区分某个文档。$document\_count_i$ 表示包含词条 $term_i$ 的文档数量,$document\_total\_count$ 表示语料库中文档总数:

$$IDF_i = \log_{10} \frac{document\_total\_count + 1}{document\_count_i + 1}$$

(分子、分母各加 1 做平滑,避免除零,与下文代码实现一致。)
def calculate_IDF(documents, document_count):
    """Compute a smoothed inverse document frequency for every term.

    Args:
        documents: list of {term: count} dicts, one per document.
        document_count: total number of documents in the corpus.

    Returns:
        dict mapping each term to log10((document_count + 1) / (df + 1)),
        where df is the number of documents with a positive count for the
        term; the +1 smoothing avoids division by zero.

    Fix: the original seeded the result from documents[0] only, which
    raised IndexError on an empty corpus and KeyError when a later
    document contained a term missing from the first one; the term set
    is now the union over all documents.
    """
    idf = {}
    for document in documents:
        for term, count in document.items():
            if term not in idf:
                idf[term] = 0
            if count > 0:
                idf[term] += 1
    for term, count in idf.items():
        idf[term] = math.log10((document_count + 1) / (count + 1))
    return idf
TF-IDF
TF-IDF 是一种用于信息检索与数据挖掘的常用加权技术,用以评估一个词条对整个语料库中其中一份文档的重要程度:词条的重要性与其在该文档中出现的频率成正比(TF),与其在语料库所有文档中出现的频率成反比(IDF)。

$$TF\_IDF_{i,j} = TF_{i,j} \times IDF_i$$
def calculate_TF_IDF(TFs, IDF):
    """Combine per-document TF dicts with a shared IDF dict.

    Returns one dict per document mapping each term to TF * IDF,
    preserving the input order.
    """
    return [
        {term: frequency * IDF[term] for term, frequency in document.items()}
        for document in TFs
    ]
实现代码
获取数据
import re
import urllib
import urllib.request

from bs4 import BeautifulSoup

import pymongo
def crawl():
    """Scrape the Douban Top-250 movie pages and return a list of movie dicts.

    Each dict holds image_url, title, country, category, rating,
    evaluator (vote count) and synopsis ("None" when the page has none).

    Fix: the HTTP response is now closed via a context manager instead of
    being leaked; the synopsis fallback is a conditional expression.
    """
    url = "https://movie.douban.com/top250?start="
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"}
    # Patterns are compiled once and reused across all 250 items.
    image_url_pattern = re.compile(r'<img .*src="(.*?)".*>')
    title_pattern = re.compile(r'<span class="title">(.*?)</span>')
    information_pattern = re.compile(r'<p class="">(.*?)</p>', re.S)
    rating_pattern = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
    evaluator_pattern = re.compile(r'<span>(.*?)人评价</span>')
    synopsis_pattern = re.compile(r'<span class="inq">(.*?)</span>')
    movies = []
    for i in range(0, 10):  # 10 pages of 25 movies each
        request = urllib.request.Request(url=url + str(25 * i), headers=headers)
        # Close the response as soon as the body has been read.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8')
        beautiful_soup = BeautifulSoup(html, "html.parser")
        for item in beautiful_soup.find_all('div', class_='item'):
            movie = {}
            item = str(item)
            movie["image_url"] = re.findall(image_url_pattern, item)[0]
            movie["title"] = re.findall(title_pattern, item)[0]
            information = re.findall(information_pattern, item)
            # '\xa0' (&nbsp;) separates year / country / category on the page.
            information_list = information[0].strip().replace('\xa0', ',').split(',')
            movie["country"] = information_list[-3]
            movie["category"] = information_list[-1]
            movie["rating"] = re.findall(rating_pattern, item)[0]
            movie["evaluator"] = re.findall(evaluator_pattern, item)[0]
            synopsis = re.findall(synopsis_pattern, item)
            movie["synopsis"] = synopsis[0] if synopsis else "None"
            movies.append(movie)
    return movies
def save_in_mongodb(uri, database_name, collection_name, data):
    """Insert `data` (a list of dicts) into the given MongoDB collection.

    Fix: the original never closed the MongoClient, leaking the
    connection; the client is now used as a context manager.
    """
    with pymongo.MongoClient(uri) as mongo_client:
        collection = mongo_client[database_name][collection_name]
        collection.insert_many(data)
if __name__ == "__main__":
    # Crawl the Top-250 listing, then persist the results to MongoDB.
    crawled_movies = crawl()
    save_in_mongodb('mongodb://localhost:27017', 'douban', 'movies', crawled_movies)
TF-IDF
import math
import pymongo
def query_from_mongodb(uri, database_name, collection_name):
    """Return a cursor over every document in the given MongoDB collection.

    NOTE(review): the returned cursor is lazy, so the client must stay
    open while the caller iterates it.
    """
    client = pymongo.MongoClient(uri)
    return client[database_name][collection_name].find()
def calculate_TF(term_vectors, terms_count):
    """Compute term frequency dicts, one per term vector.

    Args:
        term_vectors: list of {term: count} dicts over a shared vocabulary.
        terms_count: vocabulary size used as the common denominator.

    Returns:
        list of {term: count / terms_count} dicts in the same order.

    Fix: removed the dead commented-out single-vector draft that was left
    inside the function body.
    """
    TFs = []
    for term_vector in term_vectors:
        TF = {term: count / terms_count for term, count in term_vector.items()}
        TFs.append(TF)
    return TFs
def calculate_IDF(documents, document_count):
    """Compute a smoothed inverse document frequency for every term.

    Args:
        documents: list of {term: count} dicts, one per document.
        document_count: total number of documents in the corpus.

    Returns:
        dict mapping each term to log10((document_count + 1) / (df + 1)),
        where df is the number of documents with a positive count for the
        term; the +1 smoothing avoids division by zero.

    Fix: the original seeded the result from documents[0] only, which
    raised IndexError on an empty corpus and KeyError when a later
    document contained a term missing from the first one; the term set
    is now the union over all documents.
    """
    idf = {}
    for document in documents:
        for term, count in document.items():
            if term not in idf:
                idf[term] = 0
            if count > 0:
                idf[term] += 1
    for term, count in idf.items():
        idf[term] = math.log10((document_count + 1) / (count + 1))
    return idf
def calculate_TF_IDF(TFs, IDF):
    """Weight each document's term frequencies by the shared IDF values.

    Returns one {term: TF * IDF} dict per document, in input order.
    """
    weighted = []
    for frequencies in TFs:
        scores = {term: value * IDF[term] for term, value in frequencies.items()}
        weighted.append(scores)
    return weighted
if __name__ == '__main__':
    movies = query_from_mongodb('mongodb://localhost:27017', 'douban', 'movies')
    # Collect the space-separated category string of every movie.
    category_list = [movie['category'] for movie in movies]
    # Tokenize each category string and build the global vocabulary.
    category_terms_list = []
    term_set = set()
    for category in category_list:
        tokens = category.split(" ")
        category_terms_list.append(tokens)
        term_set |= set(tokens)
    # One bag-of-words vector per movie over the shared vocabulary.
    term_vectors = []
    for tokens in category_terms_list:
        vector = dict.fromkeys(term_set, 0)
        for token in tokens:
            vector[token] += 1
        term_vectors.append(vector)
    TFs = calculate_TF(term_vectors, len(term_set))
    IDF = calculate_IDF(term_vectors, len(category_list))
    for TF_IDF in calculate_TF_IDF(TFs, IDF):
        print(TF_IDF)
测试结果
最后
- 由于博主水平有限,不免有疏漏之处,欢迎读者随时批评指正,以免造成不必要的误解!