Python计算豆瓣热门电影分类TF-IDF

10 篇文章 0 订阅
7 篇文章 0 订阅

Python计算豆瓣热门电影分类TF-IDF

环境

PyCharm 2020.2.4 (Professional Edition)
bs4==0.0.1
urllib3==1.24.2
MongoDB 3.4.10
Python 3.7.4
pymongo 3.11.3

TF

词频(Term Frequency, TF)反映语料库中的词条 $term_i$ 在文档 $document_j$ 中出现的频率。

$$TF_{i,j} = \frac{count_i}{total\_count}$$

def calculate_TF(term_vectors, terms_count):
    """Normalize each term-count vector by terms_count.

    Returns one dict per input vector, mapping term -> count / terms_count,
    in the same order as term_vectors.
    """
    return [
        {term: count / terms_count for term, count in vector.items()}
        for vector in term_vectors
    ]

IDF

逆文本频率(Inverse Document Frequency, IDF)反映含某词条 $term_i$ 的文档在语料库中的比重,比重越小 IDF 值越大,因为比重越小越能够在语料库中区分某个文档。$document\_count_i$ 表示包含词条 $term_i$ 的文档数量,$document\_total\_count$ 表示语料库中文档总数。

$$IDF_i = \log_{10} \frac{document\_total\_count}{document\_count_i}$$

(注意:分子是文档总数、分母是含该词条的文档数,与代码实现一致;实现中对分子分母各加 1 做平滑处理,避免除零。)

def calculate_IDF(documents, document_count):
    """Compute smoothed inverse document frequency for every term.

    Args:
        documents: list of term-count dicts, one per document.
        document_count: total number of documents in the corpus.

    Returns:
        dict mapping each term to log10((document_count + 1) / (df + 1)),
        where df is the number of documents whose count for the term is > 0.
    """
    # Accumulate document frequencies over *all* documents instead of
    # seeding from documents[0]: the original raised KeyError for terms
    # absent from the first document and IndexError on an empty corpus.
    document_frequency = {}
    for document in documents:
        for term, count in document.items():
            if count > 0:
                document_frequency[term] = document_frequency.get(term, 0) + 1
            else:
                # Keep zero-count terms in the result, as the original did.
                document_frequency.setdefault(term, 0)

    # Add-one smoothing keeps the ratio finite and positive when df == 0.
    return {
        term: math.log10((document_count + 1) / (df + 1))
        for term, df in document_frequency.items()
    }

TF-IDF

TF-IDF是一种用于信息检索与数据挖掘的常用加权技术,用以评估一个词条对整个语料库中其中一份文档的重要程度,词条的重要性与其出现在文档中的频率成正比(TF),与其在语料库中所有文档中出现的频率成反比(IDF)。

$$TF\_IDF_{i,j} = tf_{i,j} \times idf_i$$

def calculate_TF_IDF(TFs, IDF):
    """Combine per-document TF dicts with the corpus-wide IDF dict.

    Returns one dict per document mapping term -> TF * IDF, in the same
    order as TFs.
    """
    return [
        {term: tf_value * IDF[term] for term, tf_value in tf.items()}
        for tf in TFs
    ]

实现代码

获取数据

import re
import urllib
import urllib.request

import pymongo
from bs4 import BeautifulSoup


def crawl():
    """Scrape the Douban Top 250 movie list (10 pages x 25 movies).

    Returns a list of dicts with keys: image_url, title, country,
    category, rating, evaluator, synopsis. Parsing relies on the exact
    page markup at the time of writing and will break if Douban changes it.
    """
    url = "https://movie.douban.com/top250?start="
    # Browser-like User-Agent; presumably required to avoid being blocked
    # by Douban — confirm if requests start failing without it.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"}

    # Pre-compiled patterns matched against the raw HTML of each list item.
    image_url_pattern = re.compile(r'<img .*src="(.*?)".*>')
    title_pattern = re.compile(r'<span class="title">(.*?)</span>')
    # re.S lets '.' span newlines: the info <p> block is multi-line.
    information_pattern = re.compile(r'<p class="">(.*?)</p>', re.S)
    rating_pattern = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
    evaluator_pattern = re.compile(r'<span>(.*?)人评价</span>')
    synopsis_pattern = re.compile(r'<span class="inq">(.*?)</span>')

    movies = []

    # 10 pages, 25 movies per page; 'start' is the result offset.
    for i in range(0, 10):
        request = urllib.request.Request(url=url + str(25 * i), headers=headers)
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')

        beautiful_soup = BeautifulSoup(html, "html.parser")
        for item in beautiful_soup.find_all('div', class_='item'):
            movie = {}
            # Regexes below are applied to the item's HTML as a string.
            item = str(item)
            image_url = re.findall(image_url_pattern, item)[0]
            movie["image_url"] = image_url
            title = re.findall(title_pattern, item)[0]
            movie["title"] = title
            information = re.findall(information_pattern, item)
            # The info text separates fields with \xa0 (non-breaking
            # space); after replace+split, country and category are taken
            # by position from the tail — fragile if the layout changes.
            information_list = information[0].strip().replace('\xa0', ',').split(',')
            movie["country"] = information_list[-3]
            movie["category"] = information_list[-1]
            rating = re.findall(rating_pattern, item)[0]
            movie["rating"] = rating
            evaluator = re.findall(evaluator_pattern, item)[0]
            movie["evaluator"] = evaluator
            synopsis = re.findall(synopsis_pattern, item)

            # Some movies have no one-line synopsis on the list page;
            # store the literal string "None" in that case.
            if len(synopsis) > 0:
                movie["synopsis"] = synopsis[0]
            else:
                movie["synopsis"] = "None"

            movies.append(movie)

    return movies


def save_in_mongodb(uri, database_name, collection_name, data):
    """Insert `data` (a list of dicts) into the given MongoDB collection.

    Args:
        uri: MongoDB connection string, e.g. 'mongodb://localhost:27017'.
        database_name: target database name.
        collection_name: target collection name.
        data: non-empty list of documents to insert.
    """
    mongo_client = pymongo.MongoClient(uri)
    try:
        collection = mongo_client[database_name][collection_name]
        collection.insert_many(data)
    finally:
        # The original never closed the client, leaking the connection.
        mongo_client.close()


if __name__ == "__main__":
    movies = crawl()
    save_in_mongodb('mongodb://localhost:27017', 'douban', 'movies', movies)

TF-IDF

import math
import pymongo


def query_from_mongodb(uri, database_name, collection_name):
    """Fetch all documents from a MongoDB collection.

    Returns the documents as a list. The cursor is materialized so the
    client can be closed before returning — the original leaked the
    connection and handed back a lazy cursor tied to it. A list is still
    iterable, so callers that only loop over the result keep working.
    """
    mongo_client = pymongo.MongoClient(uri)
    try:
        return list(mongo_client[database_name][collection_name].find())
    finally:
        mongo_client.close()


def calculate_TF(term_vectors, terms_count):
    """Compute term frequency for each term-count vector.

    Args:
        term_vectors: list of dicts mapping term -> raw count.
        terms_count: normalization denominator shared by all vectors.
            NOTE(review): the caller passes the vocabulary size here,
            not the per-document term total used by the textbook TF
            definition — confirm this normalization is intentional.

    Returns:
        list of dicts mapping term -> count / terms_count, in the same
        order as term_vectors.
    """
    # Dead commented-out code removed; loop-and-append replaced by a
    # comprehension with identical results.
    return [
        {term: count / terms_count for term, count in term_vector.items()}
        for term_vector in term_vectors
    ]


def calculate_IDF(documents, document_count):
    """Compute smoothed inverse document frequency for every term.

    Args:
        documents: list of term-count dicts, one per document.
        document_count: total number of documents in the corpus.

    Returns:
        dict mapping each term to log10((document_count + 1) / (df + 1)),
        where df is the number of documents whose count for the term is > 0.
    """
    # Accumulate document frequencies over *all* documents instead of
    # seeding from documents[0]: the original raised KeyError for terms
    # absent from the first document and IndexError on an empty corpus.
    document_frequency = {}
    for document in documents:
        for term, count in document.items():
            if count > 0:
                document_frequency[term] = document_frequency.get(term, 0) + 1
            else:
                # Keep zero-count terms in the result, as the original did.
                document_frequency.setdefault(term, 0)

    # Add-one smoothing keeps the ratio finite and positive when df == 0.
    return {
        term: math.log10((document_count + 1) / (df + 1))
        for term, df in document_frequency.items()
    }


def calculate_TF_IDF(TFs, IDF):
    """Combine per-document TF dicts with the corpus-wide IDF dict.

    Returns one dict per document mapping term -> TF * IDF, in the same
    order as TFs.
    """
    return [
        {term: tf_value * IDF[term] for term, tf_value in tf.items()}
        for tf in TFs
    ]


if __name__ == '__main__':
    # Load the scraped movies; each has a space-separated 'category'
    # string, treated as one "document" for TF-IDF purposes.
    movies = query_from_mongodb('mongodb://localhost:27017', 'douban', 'movies')
    category_list = []
    for movie in movies:
        category_list.append(movie['category'])

    # Tokenize each category string and collect the global vocabulary.
    category_terms_list = []
    term_set = set([])
    for category in category_list:
        category_terms = category.split(" ")
        category_terms_list.append(category_terms)

        term_set = term_set.union(set(category_terms))

    # Build one count vector per movie over the shared vocabulary, so
    # every vector has the same keys (required by calculate_IDF).
    term_vectors = []

    for category_terms in category_terms_list:
        term_vector = dict.fromkeys(term_set, 0)
        for term in category_terms:
            term_vector[term] += 1
        term_vectors.append(term_vector)

    # NOTE(review): terms_count is the vocabulary size, not the number of
    # terms in each document — the textbook TF formula divides by the
    # per-document term total. Confirm this normalization is intended.
    terms_count = len(term_set)

    TFs = calculate_TF(term_vectors, terms_count)

    document_count = len(category_list)
    IDF = calculate_IDF(term_vectors, document_count)

    TF_IDFs = calculate_TF_IDF(TFs, IDF)

    for TF_IDF in TF_IDFs:
        print(TF_IDF)

测试结果

最后

  • 由于博主水平有限,不免有疏漏之处,欢迎读者随时批评指正,以免造成不必要的误解!
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值