第五部分-微博热搜内容标签_微博的标签代码-CSDN博客

本文链接：https://blog.csdn.net/weixin_43596589/article/details/117032923

python微博热搜内容标签

文章只做简单记录和放出完整代码，详细内容可以一起讨论
本文需要用到上一部分的代码，详见：https://blog.csdn.net/weixin_43596589/article/details/115601133

字符串匹配的中文分词

参考了前辈写的代码及其提供的方法，利用词典匹配对热搜内容进行分词，从中识别出关键词。

第一步

在调用mysql存入数据库前写入以下代码

    # ······
    # Turn the list of numeric strings into a list of ints.
    view = [int(count) for count in view]

    # Load the custom dictionary and the stop characters before segmenting.
    Dicfile = "mySelfDict.txt"
    splitWord.InitDic(Dicfile)
    splitWord.InitStopword()

    # One keyword list per trending headline, in the same order as `affair`.
    key_list = [splitWord.WordSeg(headline) for headline in affair]

第二步

新建splitWord.py

import sys
import os
import re

# Single characters that act as delimiters when a headline is cut into
# fragments: ASCII space / hyphen / full stop plus CJK and full-width
# punctuation (ideographic space U+3000, katakana middle dot U+30FB,
# ideographic full stop U+3002, full-width comma, brackets, quotes, ...).
# Every entry must be exactly one character long because WordSeg tests the
# text character by character.
StopWordtmp = [
    ' ', '\u3000', '\u30fb', '\u3002', '\uff0c', '\uff01', '\uff1f', '\uff1a', '\u201c', '\u201d',
    '\u2018', '\u2019', '\uff08', '\uff09', '\u3010', '\u3011', '\uff5b', '\uff5d', '-', '\uff0d',
    '\uff5e', '\uff3b', '\uff3d', '\u3014', '\u3015', '\uff0e', '\uff20', '\uffe5', '\u2022', '.',
]

# Dictionary words (key -> 1), filled by InitDic().
WordDic = {}
# Active delimiter list, filled by InitStopword().
StopWord = []
# Window length, in characters, that WordSeg passes to PostSenSeg as the
# longest dictionary candidate to try.
span = 16


def InitStopword():
    """Append every entry of ``StopWordtmp`` to the module-level ``StopWord``.

    ``StopWord`` is extended in place; entries already present are kept, so
    calling this more than once adds the same characters again.
    """
    StopWord.extend(StopWordtmp)


def InitDic(Dicfile):
    """Load dictionary words from a text file into the module-level ``WordDic``.

    Args:
        Dicfile: Path of a UTF-8 text file holding one dictionary word per
            line. A leading byte-order mark is accepted.

    Every non-blank line, stripped of surrounding whitespace, is stored as a
    key of ``WordDic`` with the value ``1``. Entries already in ``WordDic``
    are kept. The resulting entry count and a completion message are printed.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise stay glued to the first word and make it unmatchable.
    # The ``with`` block closes the file even if reading fails part-way.
    with open(Dicfile, mode='r', encoding="utf-8-sig") as f:
        for line in f:
            word = line.strip()
            # Blank lines would register the empty string as a word.
            if word:
                WordDic[word] = 1
    print(len(WordDic))
    print("Dictionary has built down!")


def WordSeg(affair):
    """Split a headline at stop characters and collect keywords per fragment.

    Args:
        affair: The text to segment.

    Returns:
        A list holding, fragment by fragment from left to right, the items
        returned by ``PostSenSeg(fragment, span)``.

    The text is cut at every character found in ``StopWord``. Fragments that
    are themselves entries of ``StopWord`` are dropped; all other fragments,
    including empty ones produced by leading or adjacent stop characters,
    are handed to ``PostSenSeg``.
    """
    pieces = []
    current = ''
    last_index = len(affair) - 1
    for pos, ch in enumerate(affair):
        if ch in StopWord:
            # Close the running fragment and record the delimiter itself.
            pieces.append(current)
            pieces.append(ch)
            current = ''
            continue
        current += ch
        if pos == last_index:
            # Text ended without a trailing delimiter: flush what is left.
            pieces.append(current)

    keywords = []
    for piece in pieces:
        if piece not in StopWord:
            keywords.extend(PostSenSeg(piece, span))
    return keywords


# Run of "number" characters at the very end of the window: ASCII digits,
# full-width digits, the Chinese numerals U+4E00 ... U+5146 listed below and
# the full-width letter U+FF2F, which the number class has always included.
_NUMBER_TAIL_RE = re.compile(
    r"[0-9\uff10-\uff19"
    r"\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u96f6"
    r"\u5341\u767e\u5343\u4e07\u4ebf\u5146\uff2f]+\Z"
)
# Run of ASCII or full-width Latin letters at the very end of the window.
_LETTER_TAIL_RE = re.compile(r"[a-zA-Z\uff41-\uff5a\uff21-\uff3a]+\Z")


def PostSenSeg(sen, span):
    """Extract dictionary keywords from one fragment, scanning right to left.

    The fragment is consumed from its end. At each step a window of at most
    ``span`` characters ending at the current position is examined:

    1. a trailing run of number characters, then a trailing run of Latin
       letters, is cut off and discarded;
    2. the window is shrunk from the left until it is a key of ``WordDic``
       or only one character remains; that piece is consumed and, when it is
       longer than one character, recorded as a keyword.

    Args:
        sen: The text fragment to segment.
        span: Maximum window length in characters. Values below 1 are
            treated as 1.

    Returns:
        The matched dictionary words of two or more characters, ordered from
        the end of ``sen`` towards its beginning.
    """
    # A non-positive span would leave an empty window that can never shrink
    # to a single character, so the scan would not terminate.
    span = max(span, 1)

    cur = len(sen)
    pre = max(cur - span, 0)
    key_word_list = []
    while cur > 0:
        # Both patterns are anchored at the END of the window because the
        # matched length is removed from the right-hand side (cur).
        for tail_re in (_NUMBER_TAIL_RE, _LETTER_TAIL_RE):
            run = tail_re.search(sen[pre:cur])
            if run:
                cur -= len(run.group())
                pre = max(cur - span, 0)
        if cur == 0:
            break

        candidate = sen[pre:cur]
        if candidate in WordDic or cur - 1 == pre:
            # Single characters are consumed but never reported.
            if len(candidate) > 1:
                key_word_list.append(candidate)
            cur = pre
            pre = max(pre - span, 0)
        else:
            # No match yet: drop the left-most character and retry.
            pre += 1
    return key_word_list