结巴分词 知乎

Scrapy Pipeline

from itemadapter import ItemAdapter
import pymongo
from spider.settings import MONGODB_PORT
from spider.settings import MONGODB_HOST
from spider.settings import MONGODB_DBNAME
from spider.settings import MONGODB_Q_SHEET_NAME
from spider.settings import MONGODB_A_SHEET_NAME
from spider.items import QuestionItem
from spider.items import AnswerItem


class QuestionPipeline(object):
    """Item pipeline that stores scraped items in MongoDB.

    ``QuestionItem`` instances are written to the collection named by
    ``MONGODB_Q_SHEET_NAME``; every other item is written to the collection
    named by ``MONGODB_A_SHEET_NAME``.
    """

    def __init__(self):
        """Connect to MongoDB and select the configured database."""
        client = pymongo.MongoClient(host=MONGODB_HOST, port=MONGODB_PORT)
        self.my_db = client[MONGODB_DBNAME]

    def _insert(self, sheet_name, item):
        """Insert ``item`` (converted to a dict) into collection ``sheet_name``.

        ``self.sheet`` is updated to the collection last written to, as the
        original implementation did.
        """
        self.sheet = self.my_db[sheet_name]
        self.sheet.insert_one(dict(item))
        return item

    def process_question(self, item):
        """Store ``item`` in the question collection and return it."""
        return self._insert(MONGODB_Q_SHEET_NAME, item)

    def process_answer(self, item):
        """Store ``item`` in the answer collection and return it."""
        return self._insert(MONGODB_A_SHEET_NAME, item)

    def process_item(self, item, spider):
        """Route ``item`` to the matching collection.

        Args:
            item: The scraped item.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``. Returning it matters: the framework passes the
            return value on to the next pipeline stage, and the previous
            implementation implicitly returned ``None``.
        """
        if isinstance(item, QuestionItem):
            return self.process_question(item)
        return self.process_answer(item)

JIEBA 百度停用词

import json

import jieba
import pymongo
from collections import Counter

# Host and port of the MongoDB server used by readMongoDB() and storeMongoDB().
MONGODB_HOST = "127.0.0.1"
MONGODB_PORT = 27017
# Name of the database both functions operate on.
MONGODB_DBNAME = "zhihu"
# Collection that readMongoDB() reads from.
MONGODB_A_SHEET_NAME = "answer"
# Collection that storeMongoDB() writes to.
MONGODB_S_SHEET_NAME = "separate"


def cal(string):
    """Segment ``string`` with jieba and count the tokens that are not stop words.

    Stop words are loaded from ``baidu_stopwords.txt`` on every call.

    Args:
        string: Text to segment.

    Returns:
        A dict mapping each remaining token to its number of occurrences.
    """
    # A set makes each membership test O(1); the list returned by readFile
    # would be scanned once per token.
    stop_words = set(readFile('baidu_stopwords.txt'))
    # Segment in accurate mode and drop stop words in the same pass.
    word_list = [word for word in jieba.cut(string, cut_all=False) if word not in stop_words]
    return dict(Counter(word_list))


def cal_separate(string):
    """Segment ``string`` with jieba and return the tokens that are not stop words.

    Stop words are loaded from ``baidu_stopwords.txt`` on every call.

    Args:
        string: Text to segment.

    Returns:
        A list of tokens in their original order, with stop words removed.
    """
    # A set makes each membership test O(1); the list returned by readFile
    # would be scanned once per token.
    stop_words = set(readFile('baidu_stopwords.txt'))
    # Segment in accurate mode and drop stop words in the same pass.
    return [word for word in jieba.cut(string, cut_all=False) if word not in stop_words]


def readMongoDB():
    """Read every document of the answer collection from MongoDB.

    Returns:
        A list with all documents found in
        ``MONGODB_DBNAME[MONGODB_A_SHEET_NAME]``.
    """
    # Use the client as a context manager so the connection is closed when
    # we are done; previously it was left open.
    with pymongo.MongoClient(host=MONGODB_HOST, port=MONGODB_PORT) as client:
        sheet = client[MONGODB_DBNAME][MONGODB_A_SHEET_NAME]
        # Exhaust the cursor while the client is still open.
        return list(sheet.find())


def readFile(file_name):
    """Read a UTF-8 text file and return its lines without trailing newlines.

    Args:
        file_name: Path of the file to read.

    Returns:
        A list with one string per line of the file.
    """
    # ``with`` guarantees the file is closed even if reading fails.
    with open(file_name, "r", encoding="utf-8") as fp:
        # Strip the trailing newline from each line; otherwise it would
        # interfere with stop-word matching.
        return [line.rstrip("\n") for line in fp]


def separate_month(questionList):
    """Group answer texts by month and concatenate the texts of each group.

    Each element must provide ``'created_time'`` (a string whose part before
    the first space looks like ``YYYY-MM-DD``) and ``'content'`` (a string).

    NOTE: the key is the month field only (e.g. ``"01"``), so the same month
    of different years ends up in the same group.

    Args:
        questionList: Iterable of dict-like records.

    Returns:
        A dict mapping the month string to the concatenation of the
        ``'content'`` values of that month, in input order.
    """
    parts_by_month = {}
    for question in questionList:
        date_part = question['created_time'].split(" ")[0]
        month = date_part.split("-")[1]
        # Collect the pieces and join once at the end; repeatedly extending
        # one string copies the accumulated text again on every append.
        parts_by_month.setdefault(month, []).append(question['content'])
    return {month: "".join(parts) for month, parts in parts_by_month.items()}


def separate_day(questionList):
    """Group answer texts by day and concatenate the texts of each group.

    Each element must provide ``'created_time'`` (a string; the part before
    the first space is used as the key) and ``'content'`` (a string).

    Args:
        questionList: Iterable of dict-like records.

    Returns:
        A dict mapping the date part of ``'created_time'`` to the
        concatenation of the ``'content'`` values of that day, in input order.
    """
    parts_by_day = {}
    for question in questionList:
        day = question['created_time'].split(" ")[0]
        # Collect the pieces and join once at the end; repeatedly extending
        # one string copies the accumulated text again on every append.
        parts_by_day.setdefault(day, []).append(question['content'])
    return {day: "".join(parts) for day, parts in parts_by_day.items()}


def storeMongoDB(data):
    """Insert one document into the segmentation-result collection.

    Args:
        data: The document (a dict) to insert into
            ``MONGODB_DBNAME[MONGODB_S_SHEET_NAME]``.
    """
    # Use the client as a context manager so the connection is closed after
    # the insert; previously every call left a client open.
    with pymongo.MongoClient(host=MONGODB_HOST, port=MONGODB_PORT) as client:
        client[MONGODB_DBNAME][MONGODB_S_SHEET_NAME].insert_one(data)


def deal_separate(type):
    """Segment the stored answers per period and dump the result to JSON.

    The answers returned by ``readMongoDB()`` are grouped by month when
    ``type`` is ``"month"`` and by day otherwise. Each group's text is
    segmented with ``cal_separate`` and the tokens are joined with ``"|"``.
    Progress is printed, and the mapping is written to
    ``separateWord_<type>.json``.

    Args:
        type: ``"month"`` for monthly grouping; any other value groups by day.

    Returns:
        A dict mapping each period key to its ``"|"``-joined token string.
    """
    group_answers = separate_month if type == "month" else separate_day
    grouped = group_answers(readMongoDB())

    total_data = {}
    for period in sorted(grouped):
        print(period)
        print("-------")
        tokens = cal_separate(grouped[period])
        print(tokens)
        total_data[period] = "|".join(tokens)
    print(total_data)

    # Serialise before opening the file so a failure leaves no empty file.
    serialized = json.dumps(total_data, ensure_ascii=False, indent=5)
    out_path = "separateWord_" + type + ".json"
    with open(out_path, 'w', encoding="utf-8") as out_file:
        out_file.write(serialized)
    return total_data


def deal_count(type):
    """Count the most frequent words per period, store and dump them.

    The answers returned by ``readMongoDB()`` are grouped by month when
    ``type`` is ``"month"`` and by day otherwise. For each group the 100 most
    frequent tokens (as counted by ``cal``) are printed, stored through
    ``storeMongoDB`` and collected; the collected mapping is written to
    ``countWord<type>.json``.

    Args:
        type: ``"month"`` for monthly grouping; any other value groups by day.
    """
    if type == "month":
        dic = separate_month(readMongoDB())
    else:
        dic = separate_day(readMongoDB())

    total_data = {}
    for key in sorted(dic):
        # Most frequent first; the sort is stable, so ties keep their order.
        sorted_words = sorted(cal(dic[key]).items(), key=lambda d: d[1], reverse=True)
        data = {}
        for word, count in sorted_words[:100]:
            data[word] = count
            print(word, ":", count)
        total_data[key] = data
        # The previous code inserted an always-empty dict here. Build a real
        # document instead. It is kept separate from ``data`` because the
        # insert adds an ``_id`` field to the dict it is given, which would
        # break the JSON dump below. Words go into a list so that tokens are
        # never used as MongoDB field names.
        # NOTE(review): this document layout was chosen during the fix;
        # confirm it against whatever reads the collection.
        db_data = {
            "period": key,
            "words": [{"word": word, "count": count} for word, count in data.items()],
        }
        storeMongoDB(db_data)
    print(total_data)
    j = json.dumps(total_data, ensure_ascii=False, indent=5)
    with open("countWord" + type + '.json', "w", encoding='utf-8') as output_file:
        output_file.write(j)
        print("加载数据完成...")


if __name__ == "__main__":
    # Script entry point: segment the stored answers grouped by day.
    deal_separate("day")
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值