python3 - 使用 jieba3k 对直播平台房间标题进行分词

python3 安装jieba:


pip3 install jieba


或者,先下载 http://pypi.python.org/pypi/jieba/ ,解压后在解压出的目录中运行 python3 setup.py install


参考:https://github.com/fxsjy/jieba


实例:


得到标签和创建mydict


import requests

from pyquery import PyQuery as pq
from db import MongoClient
from config import MY_DICT

# Module-level storage handle (the MongoClient class imported from the local
# db module); get_label() and make_mydict() below both use it.
db = MongoClient()

def get_label(url, timeout=10):
    """Scrape the hero table found at ``url`` and store one record per row.

    The page is fetched with ``requests``, decoded as UTF-8 and parsed with
    PyQuery.  Every table row that contains at least one ``<td>`` (rows made
    only of ``<th>`` cells are skipped) is turned into a dict and handed to
    ``db.save``.

    Args:
        url: Address of the page that contains the table.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout ``requests.get`` may block indefinitely.

    Each stored record has the keys ``id`` (1-based counter over the data
    rows), ``hero_name`` (text of column 2), ``hero_name_list`` (texts of
    columns 2 and 3) and ``join_time`` (text of column 6).
    """
    r = requests.get(url, timeout=timeout)
    r.encoding = 'utf-8'  # set the page encoding explicitly via r.encoding
    doc = pq(r.text)
    rows = doc.find('body > div.body-wrapper > div.content-wrapper > div > div.main-content > table:nth-child(154) > tr').items()
    hero_id = 0
    for tr in rows:
        if not tr.find('td'):  # skip rows without <td> cells (header rows)
            continue
        # NOTE: the table on the page has a problem, so one record was
        # corrected separately with this mongo shell command:
        #   db.getCollection('hero').update(
        #       {"id": 2},
        #       {'$set': {'hero_name': '幻翎',
        #                 'hero_name_list': ['幻翎', '洛'],
        #                 "join_time": "2017年4月18日"}},
        #       {"multi": false, "upsert": false}
        #   );
        hero_id += 1
        hero_name = tr.find('td:nth-child(2)').text().strip()
        second_name = tr.find('td:nth-child(3)').text().strip()
        join_time = tr.find('td:nth-child(6)').text().strip()
        db.save({
            'id': hero_id,
            'hero_name': hero_name,
            'hero_name_list': [hero_name, second_name],
            'join_time': join_time,
        })

def make_mydict():
    """Write every name yielded by ``db.get_hero_name_list()`` to ``MY_DICT``.

    The file is (re)created in UTF-8 and receives one name per line.
    """
    with open(MY_DICT, mode='w', encoding='utf-8') as dict_file:
        dict_file.writelines(
            str(entry) + '\n' for entry in db.get_hero_name_list()
        )

if __name__ == '__main__':
    # Scraping step, currently commented out; only the dictionary is rebuilt.
    # get_label('http://baike.baidu.com/item/英雄联盟/4615671#4')
    make_mydict() # create the dictionary file
    print('ok...')


分词器


import jieba
import jieba.analyse

from db import MongoClient
from config import MY_DICT

class Tokenizer(object):
    """Label rooms with the hero names that appear in their titles.

    Room titles are segmented with jieba (with the ``MY_DICT`` user dictionary
    loaded) and every token that is also a line of ``MY_DICT`` becomes a label
    of that room.
    """

    def __init__(self):
        self._db = MongoClient()
        # Load our own word list into jieba.
        jieba.load_userdict(MY_DICT)

    def get_hero_list(self):
        """Return the non-empty, stripped lines of ``MY_DICT`` in file order."""
        with open(MY_DICT, mode='r', encoding='utf-8') as f:
            stripped = (line.strip() for line in f)
            # Blank lines would otherwise show up as '' entries.
            return [name for name in stripped if name]

    @staticmethod
    def _match_labels(words, hero_names):
        """Return the distinct ``words`` found in ``hero_names``.

        Order of first appearance is kept (on interpreters where ``dict``
        preserves insertion order), so the result is deterministic.
        """
        return list(dict.fromkeys(w for w in words if w in hero_names))

    def participle(self):
        """Segment every room title and store the matching hero labels.

        For each room from ``self._db.get_rooms()`` the ``r_name`` is cut with
        ``jieba.lcut`` and the matching labels are written to the room's
        ``r_label`` field through ``self._db.set_label``.  The hero list and,
        per room, the tokens and labels are printed.
        """
        hero_list = self.get_hero_list()
        print('/'.join(hero_list))
        # Build the lookup set once: O(1) membership test per token.
        hero_names = set(hero_list)
        for room in self._db.get_rooms():
            # Segmentation (jieba's default, precise mode).
            msg = jieba.lcut(room['r_name'])
            labels = self._match_labels(msg, hero_names)  # de-duplicated
            self._db.set_label(query={'r_id': room['r_id']},
                               data={'$set': {'r_label': labels}})
            print(msg, labels)

if __name__ == '__main__':
    # Run the tokenizer over the rooms returned by the database wrapper.
    tokenizer = Tokenizer()
    tokenizer.participle()

db


import pymongo

from config import *

class MongoClient(object):
    """Small wrapper around ``pymongo.MongoClient`` for rooms and hero names.

    The connection URL, database name and collection names come from the
    ``MONGO_URL``, ``MONGO_DB``, ``MONGO_TABLE`` and ``MONGO_HERO_NAME``
    constants.
    """

    def __init__(self):
        self._client = pymongo.MongoClient(MONGO_URL)

    def get_rooms(self):
        """Yield ``{'r_id': ..., 'r_name': ...}`` for every room document."""
        rooms = self._client[MONGO_DB][MONGO_TABLE]
        for room in rooms.find():  # no limit: iterate over all rooms
            yield {
                'r_id': room['r_id'],
                'r_name': room['r_name'],
            }

    def set_label(self, **kwargs):
        """Apply the update ``kwargs['data']`` to one room matching ``kwargs['query']``.

        ``data`` must be an update document built from operators such as
        ``$set``.  No document is inserted when nothing matches.
        """
        # update_one replaces Collection.update, which was deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        self._client[MONGO_DB][MONGO_TABLE].update_one(
            kwargs['query'], kwargs['data'], upsert=False)

    def save(self, msg):
        """Insert the document ``msg`` into the hero-name collection.

        Failures are printed rather than raised.
        """
        try:
            # insert_one replaces Collection.insert, which was deprecated in
            # PyMongo 3 and removed in PyMongo 4.
            self._client[MONGO_DB][MONGO_HERO_NAME].insert_one(msg)
        except Exception as e:
            print("e: ", e)

    def get_hero_name_list(self):
        """Yield every entry of ``hero_name_list`` across all hero documents."""
        for hero in self._client[MONGO_DB][MONGO_HERO_NAME].find():
            for name in hero['hero_name_list']:
                yield name




问题:


1. 运行的文件名和import xxx 的包名重复


import jieba
jieba.cut("我来到北京清华大学")

AttributeError: 'module' object has no attribute 'cut'

不要把要运行的脚本命名为 jieba.py:否则 import jieba 导入的是这个脚本本身,而不是已安装的 jieba 库,所以会报找不到 cut 属性的错误。


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

PeersLee

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值