import pandas as pd
import pymysql
from sqlalchemy import create_engine
from collections import Counter
import jieba.analyse
import jieba
import time
import pynlpir
import datetime
def getYesterday(today=None) -> datetime.date:
    """Return the calendar date one day before *today*.

    Args:
        today: Reference date (a ``datetime.date``). When omitted or ``None``,
            the current local date from ``datetime.date.today()`` is used,
            which matches the original zero-argument behaviour.

    Returns:
        The ``datetime.date`` immediately preceding the reference date.
    """
    if today is None:
        today = datetime.date.today()
    return today - datetime.timedelta(days=1)
def _count_title_words(titles):
    """Tokenize each title with ``jieba.cut`` and tally token occurrences."""
    counts = Counter()
    for title in titles:
        # str() mirrors the original cast, so non-string cells are tokenized
        # by their string representation instead of raising.
        # ``jieba.cut_for_search`` was the alternative tried previously.
        counts.update(jieba.cut(str(title)))
    return counts


def main():
    """Export the word frequencies of yesterday's titles to an Excel file.

    Reads the ``title`` column of the ``toujidao`` table for rows whose
    ``datetime`` equals yesterday at 00:00:00, segments every title with
    jieba, counts the resulting tokens, and writes the counts (sorted in
    descending order) to ``toujidao_<date>.xls`` in the shared directory.

    When the query returns no rows an empty sheet is written instead of
    failing while sorting a column-less frame.
    """
    # Resolve the date once so the query and the output file name always
    # agree, even if the run happens to cross midnight.
    day = getYesterday()

    # NOTE(review): connection credentials are hard-coded in the URL;
    # consider loading them from configuration or the environment.
    engine = create_engine(
        "mysql+pymysql://public:123456@192.168.0.22:3307/toujidao?charset=utf8")
    try:
        # The only interpolated value is the internally generated date, not
        # user input.
        sql = r'select title ,datetime from toujidao where datetime = "{0} 00:00:00"'.format(
            day)
        df = pd.read_sql(sql, engine, index_col='datetime')
    finally:
        # Release pooled connections as soon as the data has been fetched.
        engine.dispose()

    # With ``datetime`` used as the index, the first remaining value of each
    # row is the title.
    counts = _count_title_words(row[0] for row in df.values)

    # ``columns=[0]`` guarantees the count column exists even when ``counts``
    # is empty; otherwise ``sort_values(by=[0])`` raises KeyError.
    df_dict = pd.DataFrame.from_dict(dict(counts), orient='index', columns=[0])
    ranked = df_dict.sort_values(by=[0], ascending=False)

    # NOTE(review): writing ``.xls`` relies on the legacy xlwt writer, which
    # recent pandas releases no longer support -- confirm the deployed pandas
    # version or move to ``.xlsx``.
    ranked.to_excel(
        r'/home/companyshare/IT部/toujidao/toujidao_{0}.xls'.format(day))

    # Alternative tokenizer (disabled in the original): pynlpir. That variant
    # called pynlpir.open(), took item [0] of every entry returned by
    # pynlpir.segment(title) as the token, called pynlpir.close(), and then
    # counted, sorted and exported the tokens the same way as above.
# Run the export only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
# 使用 jieba 对文本进行分词并统计词频
# 最新推荐文章于 2024-01-16 17:37:30 发布