import re
import os
import csv
from datetime import datetime, timedelta

import jieba.analyse as anls  # keyword extraction (TextRank)
import openpyxl


def clear_word(word_dict):
    """Remove everyday stop words (listed in clear_word.txt) from *word_dict*.

    :param word_dict: dict mapping word -> occurrence count (mutated in place)
    :return: the same dict with stop words removed
    """
    with open(r"D:\reptile\融媒体v2\grab_data\clear_word.txt", "r",
              encoding="utf-8-sig") as f_r:
        for line in f_r:
            # pop with a default replaces the `in keys()` + `del` double lookup
            word_dict.pop(line.strip(), None)
    return word_dict


def get_col_name_index(reader, title):
    """Return the column index of *title* in the CSV header row.

    Only the first row yielded by *reader* is examined; the reader is
    advanced past it as a side effect.

    :param reader: csv.reader object
    :param title: column name to look up
    :return: zero-based column index, or -1 if the file is empty
    :raises Exception: if *title* is not present in the header row
    """
    for row in reader:
        try:
            return row.index(title)
        except ValueError:  # narrowed from a bare except: only a missing title
            raise Exception('get csv title index error')
    return -1  # empty file: no header row at all


def read_csv(file_path, file_name, title=None):
    """Read one column of a CSV file and join its values into a text blob.

    :param file_path: directory containing the file (created if missing)
    :param file_name: CSV file name
    :param title: header name of the column to extract
    :return: column values joined by newlines; the first two data rows are
             skipped (the first data row duplicates the header)
    """
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    file = os.path.join(file_path, file_name)
    with open(file, "r", encoding="utf-8-sig") as csv_file:
        reader = csv.reader(csv_file)
        title_index = get_col_name_index(reader, title)
        column = [row[title_index] for row in reader]
    # Skip the title row and the duplicated first line.
    return "\n".join(column[2:])


def write_xlsx(word_dict_temp, file_path, now_time, file_name="dw_out.xlsx"):
    """Save a word-count dict to an xlsx workbook (one sheet, three columns).

    :param word_dict_temp: dict mapping word -> count
    :param file_path: output directory (created if missing)
    :param now_time: statistics date written into the first column
    :param file_name: output file name; its stem becomes the sheet title
    :raises Exception: if *word_dict_temp* is not a dict
    """
    if not isinstance(word_dict_temp, dict):
        raise Exception('word_dict_temp is not dict')
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    xlsx_file_path = os.path.join(file_path, file_name)
    book = openpyxl.Workbook()
    sheet = book.active
    sheet.title = os.path.splitext(file_name)[0]
    # BUG FIX: the original wrote the header inside the data loop (`if r == 1`),
    # which consumed the first dict entry — the top-ranked word was never
    # written out. Write the header first, then every entry starting at row 2.
    sheet.cell(row=1, column=1, value='统计日期')
    sheet.cell(row=1, column=2, value='分词')
    sheet.cell(row=1, column=3, value='出现次数')
    for r, (word, count) in enumerate(word_dict_temp.items(), start=2):
        sheet.cell(row=r, column=1, value=str(now_time))
        sheet.cell(row=r, column=2, value=str(word))
        sheet.cell(row=r, column=3, value=str(count))
    book.save(xlsx_file_path)


def deal_special_word(data_list, remv_part_list):
    """Strip unwanted leading fragments from extracted words.

    For example: 号线陈家祠站 -> 陈家祠站.

    :param data_list: list of words to clean
    :param remv_part_list: fragments to remove (longest prefix tried first)
    :return: new list with matching fragments removed
    """
    cleaned = []
    min_len = min(len(x) for x in remv_part_list)
    max_len = max(len(x) for x in remv_part_list)
    for data in data_list:
        pref_short = data[:min_len]
        pref_long = data[:max_len]
        # NOTE(review): str.replace removes every occurrence, not only the
        # prefix — preserved from the original behaviour.
        if pref_long in remv_part_list:
            data = str(data).replace(pref_long, '')
        elif pref_short in remv_part_list:
            data = str(data).replace(pref_short, '')
        cleaned.append(data)
    return cleaned


def add_word(word_dict_temp, text, dw_path):
    """Supplement the keyword dict with domain-specific pattern words.

    Extracts dates, metro lines, stations, districts, flower markets, bus
    routes, parks and guides from *text*, counts them, dumps the list to
    add_word.txt, and merges the 300 most frequent into *word_dict_temp*.
    Single-character words are dropped from the result.

    :param word_dict_temp: dict mapping word -> count (updated and filtered)
    :param text: full corpus text to scan
    :param dw_path: output directory for add_word.txt (created if missing)
    :return: new dict containing only words longer than one character
    """
    if not os.path.exists(dw_path):
        os.makedirs(dw_path)
    t0 = re.findall(r'\d{1,2}月\d{1,2}日', text)
    t2 = re.findall(r'\d{1,2}号线', text)
    t3 = re.findall(r'[\u4e00-\u9fa5]{1,5}站', text)
    t4 = re.findall(r'[\u4e00-\u9fa5]{1,2}区', text)
    t5 = re.findall(r'[\u4e00-\u9fa5]{1,5}号线', text)
    t6 = re.findall(r'[\u4e00-\u9fa5]{1,2}花市', text)
    t7 = re.findall(r'[\u4e00-\u9fa5]{0,5}\d{1,3}路', text)
    t8 = re.findall(r'[\u4e00-\u9fa5]{1,6}公园', text)
    t9 = re.findall(r'[\u4e00-\u9fa5]{2,4}指南', text)
    # Station names often carry leftovers such as 号线/至/到 in front; strip them.
    t3 = deal_special_word(t3, ['号线', '线至', '等到', '至', '到', '线'])
    word_list = t0 + t2 + t3 + t4 + t5 + t6 + t7 + t8 + t9
    word_dict = {word: text.count(word) for word in word_list}
    word_dict_list = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
    add_file = os.path.join(dw_path, "add_word.txt")
    with open(add_file, "w", encoding="utf-8-sig") as f_w:
        # `with` closes the file; the original's explicit f_w.close() removed
        for k, v in word_dict_list:
            f_w.write(k + "\n")
    word_dict_temp.update(dict(word_dict_list[:300]))
    # Keep only multi-character words.
    return {word: count for word, count in word_dict_temp.items() if len(word) > 1}


def word_count(text, dm_path, dm_file_name, dw_path, now_time, dw_file_name="dw_out.xlsx"):
    """Extract keywords from *text* and write raw (dw) and filtered (dm) xlsx files.

    :param text: corpus text
    :param dm_path: output directory for the filtered hot-word file
    :param dm_file_name: filtered output file name
    :param dw_path: output directory for the raw dump and add_word.txt
    :param now_time: statistics date written into the sheets
    :param dw_file_name: raw dump file name
    """
    if not os.path.exists(dm_path):
        os.makedirs(dm_path)
    word_dict_temp = {}
    for word, _weight in anls.textrank(text, withWeight=True, topK=500):
        word_dict_temp[word] = text.count(word)
    # Supplement with pattern-matched new words.
    word_dict_temp = add_word(word_dict_temp, text, dw_path)
    # Sort by count, descending.
    word_dict_temp = dict(sorted(word_dict_temp.items(), key=lambda x: x[1], reverse=True))
    # BUG FIX: the original call was write_xlsx(word_dict_temp, dw_path,
    # dw_file_name) — dw_file_name landed in the now_time parameter, so the
    # raw dump always went to the default "dw_out.xlsx" with the file name
    # written into the date column. Pass both arguments explicitly.
    write_xlsx(word_dict_temp, dw_path, now_time, dw_file_name)
    word_dict_temp = clear_word(word_dict_temp)  # filter out everyday words
    # NOTE(review): the print message says top 100 but the slice keeps 200 —
    # preserved as-is; confirm which one is intended.
    word_dict = dict(list(word_dict_temp.items())[:200])
    print("广州花市热词前100统计输出为:" + str(word_dict))
    write_xlsx(word_dict, dm_path, now_time, dm_file_name)


if __name__ == '__main__':
    now_time = (datetime.now() - timedelta(days=2)).strftime('%Y%m%d')
    # now_time = "20191230"
    ods_file_name = "flower_fair_{time}.csv".format(time=now_time)
    ods_path = r"D:\baidu\data\crawler_data\ods\flower_fair"
    dw_path = r"D:\baidu\data\crawler_data\dw\flower_fair"
    # BUG FIX: the drive letter was missing (r"\baidu\..."), making this path
    # relative to the current drive root, inconsistent with ods_path/dw_path.
    dm_path = r"D:\baidu\data\crawler_data\dm\flower_fair"
    dw_file_name = "flower_fair_{time}_dw.xlsx".format(time=now_time)
    dm_file_name = "flower_fair_{time}.xlsx".format(time=now_time)
    title = "正文"
    text = read_csv(ods_path, ods_file_name, title)
    word_count(text, dm_path, dm_file_name, dw_path, now_time, dw_file_name)
花市分词统计
最新推荐文章发布于 2024-10-13 23:28:20