# 花市分词统计 (flower-fair word segmentation statistics)

import re, os
from datetime import datetime, timedelta
import csv
import jieba.analyse as anls  # 关键词提取
import openpyxl


def clear_word(word_dict, stop_word_file=r"D:\reptile\融媒体v2\grab_data\clear_word.txt"):
    """
    Remove everyday stop words from a word-frequency dict.

    :param word_dict: dict mapping word -> count; mutated in place
    :param stop_word_file: path to the stop-word list, one word per line
        (defaults to the original hard-coded location)
    :return: the same dict with the listed words removed
    """
    with open(stop_word_file, "r", encoding="utf-8-sig") as f_r:
        for line in f_r:
            # pop() avoids the separate `in .keys()` membership test and
            # silently ignores words that are not present
            word_dict.pop(line.strip(), None)
    return word_dict


def get_col_name_index(reader, title):
    """
    Return the column index of *title* in the first row of a csv reader.

    Consumes exactly one row from *reader*.

    :param reader: iterable of csv rows (lists of strings)
    :param title: column header to locate
    :return: 0-based column index, or -1 when the reader yields no rows
        (preserves the original's fall-through value)
    :raises Exception: when the first row does not contain *title*
    """
    first_row = next(iter(reader), None)
    if first_row is None:
        return -1  # empty input: keep the original initial value
    try:
        return first_row.index(title)
    except ValueError:
        # narrow except instead of the original bare except: only a
        # missing title is an error here
        raise Exception('get csv title index error')


def read_csv(file_path, file_name, title=None):
    """
    Read one column of a csv file and join it into newline-separated text.

    :param file_path: directory containing the csv (created if missing)
    :param file_name: csv file name
    :param title: header of the column to extract
    :return: the column values joined with newlines, skipping the first
        two data rows (header echo + duplicated first row, per the
        original source comment)
    """
    if not os.path.exists(file_path):
        os.makedirs(file_path)  # NOTE(review): creating a dir we then read from looks odd — confirm intent
    file = os.path.join(file_path, file_name)
    with open(file, "r", encoding="utf-8-sig") as csv_file:
        reader = csv.reader(csv_file)
        # consumes the title row and yields the column position
        title_index = get_col_name_index(reader, title)
        column = [row[title_index] for row in reader]
        # slice instead of the original index-range loop: drop the first
        # two remaining rows (duplicated header/content)
        return "\n".join(column[2:])


def write_xlsx(word_dict_temp, file_path, now_time, file_name="dw_out.xlsx"):
    """
    Save a word-count dict to an xlsx workbook, one row per word.

    :param word_dict_temp: dict mapping word -> occurrence count
    :param file_path: output directory (created if missing)
    :param now_time: statistics date written into the first column
    :param file_name: workbook file name; sheet title is the name's stem
    :raises Exception: when word_dict_temp is not a dict
    """
    if not isinstance(word_dict_temp, dict):
        raise Exception('word_dict_temp is not dict')
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    xlsx_file_path = os.path.join(file_path, file_name)
    book = openpyxl.Workbook()
    sheet = book.active
    # sheet title = file name without extension
    sheet.title = re.findall(r'(.+?)\.', file_name)[0]
    # BUG FIX: the original wrote the header inside the loop on the first
    # iteration, consuming — and silently dropping — the first
    # (word, count) item. Write the header up front instead.
    sheet.cell(row=1, column=1, value='统计日期')
    sheet.cell(row=1, column=2, value='分词')
    sheet.cell(row=1, column=3, value='出现次数')
    for r, (word, count) in enumerate(word_dict_temp.items(), start=2):
        sheet.cell(row=r, column=1, value=str(now_time))
        sheet.cell(row=r, column=2, value=str(word))
        sheet.cell(row=r, column=3, value=str(count))
    book.save(xlsx_file_path)


def deal_special_word(data_list, remv_part_list):
    """
    Strip unwanted leading fragments from mined words.

    Example: "号线陈家祠站" -> "陈家祠站" when "号线" is in remv_part_list.

    Fixes over the original: an empty remv_part_list no longer crashes
    (min()/max() of an empty sequence raised ValueError), and every
    distinct fragment length is tried — the original only checked the
    shortest and the longest lengths, missing middle-length fragments.

    :param data_list: list of candidate words
    :param remv_part_list: leading fragments to remove
    :return: new list with matching fragments removed
    """
    if not remv_part_list:
        return list(data_list)
    removable = set(remv_part_list)
    # longer prefixes first, mirroring the original's max-before-min order
    lengths = sorted({len(part) for part in removable}, reverse=True)
    cleaned = []
    for data in data_list:
        for n in lengths:
            prefix = data[:n]
            if prefix in removable:
                # replace() (not a pure prefix strip) keeps the original
                # behaviour of removing every occurrence of the fragment
                data = str(data).replace(prefix, '')
                break
        cleaned.append(data)
    return cleaned


def add_word(word_dict_temp, text, dw_path):
    """
    Augment the keyword dict with regex-mined domain words.

    Mines dates, metro lines, stations, districts, flower fairs, bus
    routes, parks and guides from *text*, counts their occurrences,
    writes the full ranked list to dw_path/add_word.txt, merges the top
    300 into *word_dict_temp*, and finally drops one-character words.

    :param word_dict_temp: dict word -> count, updated in place with the
        mined words before filtering
    :param text: raw corpus text
    :param dw_path: output directory for add_word.txt (created if missing)
    :return: new dict containing only words longer than one character
    """
    if not os.path.exists(dw_path):
        os.makedirs(dw_path)
    # Raw strings throughout: '\d' in a plain literal is an invalid
    # string escape (DeprecationWarning, now SyntaxWarning).
    t0 = re.findall(r'\d{1,2}月\d{1,2}日', text)              # dates, e.g. 1月28日
    t2 = re.findall(r'\d{1,2}号线', text)                     # numbered metro lines
    t3 = re.findall(r'[\u4e00-\u9fa5]{1,5}站', text)          # stations
    t4 = re.findall(r'[\u4e00-\u9fa5]{1,2}区', text)          # districts
    t5 = re.findall(r'[\u4e00-\u9fa5]{1,5}号线', text)        # named metro lines
    t6 = re.findall(r'[\u4e00-\u9fa5]{1,2}花市', text)        # flower fairs
    t7 = re.findall(r'[\u4e00-\u9fa5]{0,5}\d{1,3}路', text)   # bus routes
    t8 = re.findall(r'[\u4e00-\u9fa5]{1,6}公园', text)        # parks
    t9 = re.findall(r'[\u4e00-\u9fa5]{2,4}指南', text)        # guides
    # strip connector fragments like 号线/至/到 accidentally captured before 站
    t3 = deal_special_word(t3, ['号线', '线至', '等到', '至', '到', '线'])
    # same concatenation order as before: it fixes tie order in the sort
    word_list = t0 + t2 + t3 + t4 + t5 + t6 + t7 + t8 + t9
    word_dict = {word: text.count(word) for word in word_list}
    word_dict_list = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
    add_file = os.path.join(dw_path, "add_word.txt")
    with open(add_file, "w", encoding="utf-8-sig") as f_w:
        # the with-block closes the file; the original's stray
        # f_w.close() after the block was redundant and is removed
        f_w.writelines(k + "\n" for k, _ in word_dict_list)
    word_dict_temp.update(dict(word_dict_list[:300]))
    # one-character words are almost always noise — drop them
    return {word: count for word, count in word_dict_temp.items() if len(word) > 1}


def word_count(text, dm_path, dm_file_name, dw_path, now_time, dw_file_name="dw_out.xlsx"):
    """
    Run keyword extraction over *text* and save raw + filtered rankings.

    :param text: corpus text
    :param dm_path: output directory for the filtered top-word workbook
    :param dm_file_name: workbook name for the filtered result
    :param dw_path: output directory for the raw (pre-filter) workbook
    :param now_time: statistics date stamped into every output row
    :param dw_file_name: workbook name for the raw result
    """
    if not os.path.exists(dm_path):
        os.makedirs(dm_path)
    word_dict_temp = {}
    # textrank proposes candidate keywords; we count real occurrences
    # ourselves instead of using the textrank weight
    for word, _weight in anls.textrank(text, withWeight=True, topK=500):
        word_dict_temp[word] = text.count(word)
    # merge in regex-mined domain words (stations, lines, fairs, ...)
    word_dict_temp = add_word(word_dict_temp, text, dw_path)
    # descending by count
    word_dict_temp = dict(sorted(word_dict_temp.items(), key=lambda x: x[1], reverse=True))
    # BUG FIX: the original call omitted now_time, so dw_file_name landed
    # in the now_time parameter and the pre-filter dump always went to
    # the default "dw_out.xlsx" with a bogus date column.
    write_xlsx(word_dict_temp, dw_path, now_time, dw_file_name)  # save before filtering
    word_dict_temp = clear_word(word_dict_temp)  # drop everyday stop words
    # keep the top 200 hot words (the original message said 100 but kept 200)
    word_dict = dict(list(word_dict_temp.items())[:200])
    print("广州花市热词前200统计输出为:" + str(word_dict))
    write_xlsx(word_dict, dm_path, now_time, dm_file_name)


if __name__ == '__main__':
    # statistics date: two days before today (allows for crawl lag)
    now_time = (datetime.now() - timedelta(days=2)).strftime('%Y%m%d')
    # now_time = "20191230"  # manual override for re-runs
    ods_file_name = "flower_fair_{time}.csv".format(time=now_time)
    ods_path = r"D:\baidu\data\crawler_data\ods\flower_fair"
    dw_path = r"D:\baidu\data\crawler_data\dw\flower_fair"
    # BUG FIX: drive letter was missing (r"\baidu\..."), inconsistent
    # with the ods/dw paths above — confirm against the deployment box
    dm_path = r"D:\baidu\data\crawler_data\dm\flower_fair"
    dw_file_name = "flower_fair_{time}_dw.xlsx".format(time=now_time)
    dm_file_name = "flower_fair_{time}.xlsx".format(time=now_time)
    title = "正文"
    text = read_csv(ods_path, ods_file_name, title)
    word_count(text, dm_path, dm_file_name, dw_path, now_time, dw_file_name)
# (removed: CSDN web-page boilerplate — comments/red-packet/payment text —
#  accidentally appended below the script during copy-paste)