Shandong University Public Opinion Analysis System: Algorithm Module Development Log, 2021.05.20


Added a data cleaning module

import datetime
import pandas as pd

# CSV columns: title, url, source, timestamp, detail (second-level crawl)

def data_processing(path):
    csv = pd.read_csv(path, sep=',', usecols=[0, 1, 2, 3, 4], header=None, encoding='utf-8')

    # Text cleaning: blank out missing cells, normalize commas and strip newlines
    for index, row in csv.iterrows():
        for i in range(0, 5):
            if pd.isnull(row[i]):
                csv.at[index, i] = ''
                continue
            csv.at[index, i] = str(row[i]).replace(',', ',').replace('\n', '')
        # Source and timestamp columns: drop stray question marks
        csv.at[index, 2] = csv.at[index, 2].replace('?', '')
        csv.at[index, 3] = csv.at[index, 3].replace('?', '')

    # Timestamp normalization: convert the various source formats to 'YYYY年M月D日'
    today = datetime.date.today()
    csv_index = 0

    for item in csv[3]:
        string = item

        # 'YYYY-MM-DD' style timestamps
        if '-' in string:
            index = string.find('-')
            if len(string) > 11:
                string = string[index - 4 : index + 6]
            i = string.rindex('-')
            while len(string) > i + 1 and string[i + 1].isalnum():
                i = i + 1
            if len(string) > i + 1:
                string = string[0 : i + 1]
            string = string[0 : 4] + '年' + string[5 :]
            string = string.replace('-', '月')
            string += '日'

        # 'YYYY/MM/DD' style timestamps
        elif '/' in string:
            index = string.find('/')
            if len(string) > 11:
                string = string[index - 4 : index + 6]
            i = string.rindex('/')
            while len(string) > i + 1 and string[i + 1].isalnum():
                i = i + 1
            if len(string) > i + 1:
                string = string[0 : i + 1]
            string = string[0 : 4] + '年' + string[5 :]
            string = string.replace('/', '月')
            string += '日'
            
        # Timestamps already containing '年': trim everything after '日'
        elif '年' in string:
            index = string.find('年')
            if len(string) > 11:
                string = string[index - 4 : index + 7]
            i = len(string) - 1
            while not string[i] == '日':
                string = string[0 : i]
                i = i - 1
                if i < 4:
                    string = ''
                    break

        # 'M月D日' timestamps without a year: prepend the current year
        elif '月' in string and '日' in string:
            index1 = string.find('月')
            index2 = string.find('日')
            if index2 - index1 > 0 and index2 - index1 < 3:
                string = str(today.year) + '年' + string
            else:
                string = ''

        # Relative times that still fall on today
        elif '分钟前' in string or '小时前' in string or '今天' in string:
            string = str(today.year) + '年' + str(today.month) + '月' + str(today.day) + '日'

        elif '昨天' in string:
            temp = today - datetime.timedelta(1)
            string = str(temp.year) + '年' + str(temp.month) + '月' + str(temp.day) + '日'

        elif '前天' in string:
            temp = today - datetime.timedelta(2)
            string = str(temp.year) + '年' + str(temp.month) + '月' + str(temp.day) + '日'

        # 'N天前' (N days ago); only the last digit is read, so '10天前' hits the '0' case
        elif '天前' in string:
            index = string.find('天前')
            if string[index - 1] == '0':
                temp = today - datetime.timedelta(10)
            else:
                temp = today - datetime.timedelta(int(string[index - 1]))
            string = str(temp.year) + '年' + str(temp.month) + '月' + str(temp.day) + '日'
        # Anything else is unrecognized and dropped
        else:
            string = ''

        csv.at[csv_index, 3] = string
        csv_index += 1

    # Second-level crawl data: expand the detail column into extra '(二级)' rows
    data_all = []
    for index, row in csv.iterrows():
        if row[0]:
            temp = [row[0], row[1], row[2], row[3]]
            data_all.append(temp)

        if row[4]:
            detail = row[4].split('.')
            for item in detail:
                if item:
                    temp = [item, '', row[2] + '(二级)', '']
                    data_all.append(temp)

    # Information extraction: keep rows that have a URL or whose title mentions 山东大学
    data = []
    for line in data_all:
        if line[1]:
            data.append(line)
        elif '山东大学' in line[0]:
            data.append(line)

    print(len(data_all))    # debug: row count before the keyword filter
    return data
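
A minimal usage sketch for the cleaning entry point; the CSV path below is hypothetical and only stands in for one of the crawler's output files:

rows = data_processing('crawler_output/news_20210520.csv')    # hypothetical path
for title, url, source, date in rows[:5]:
    print(date, source, title)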

def data_processing_zhihu(path):
    csv = pd.read_csv(path, sep=',', usecols=[0, 1, 2, 3, 4], header=None, encoding='utf-8')

    # Text cleaning: blank out missing cells, normalize commas and strip newlines
    for index, row in csv.iterrows():
        for i in range(0, 5):
            if pd.isnull(row[i]):
                csv.at[index, i] = ''
                continue
            csv.at[index, i] = str(row[i]).replace(',', ',').replace('\n', '')

    # Content extraction: deduplicate entries and collect the comments (PingLun)
    data = []
    PingLun = []
    for index, row in csv.iterrows():
        temp = [row[0] + ',' + row[3].split('||')[0], row[1], '知乎', '']
        if temp not in data:
            data.append(temp)
            temp2 = row[4][2:].split('||')
            for item in temp2:
                PingLun.append(item)

    return data, PingLun

def data_processing_weibo(path):
    csv = pd.read_csv(path, sep=',', usecols=[0, 1, 2, 3, 4], header=None, encoding='utf-8')

    # Text cleaning: blank out missing cells, normalize commas and strip newlines
    for index, row in csv.iterrows():
        for i in range(0, 5):
            if pd.isnull(row[i]):
                csv.at[index, i] = ''
                continue
            csv.at[index, i] = str(row[i]).replace(',', ',').replace('\n', '')

    # Content extraction: strip '#' topic markers, deduplicate, and collect comment texts
    data = []
    List0 = []
    PingLun = []
    for index, row in csv.iterrows():
        temp = [row[0].replace('#', ''), row[1], '微博', '']
        if temp not in data:
            data.append(temp)
            List0.append(temp[0])
        temp2 = row[4].split('#')
        for item in temp2:
            string = item.replace(' ', '').replace('\u200b', '')
            if string and string not in List0:
                PingLun.append(string)

    return data, PingLun
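
The two platform-specific variants return both the cleaned entries and a separate comment list (PingLun). A minimal usage sketch, with hypothetical file paths:

zhihu_data, zhihu_comments = data_processing_zhihu('crawler_output/zhihu.csv')    # hypothetical path
weibo_data, weibo_comments = data_processing_weibo('crawler_output/weibo.csv')    # hypothetical path
print(len(zhihu_data), len(weibo_data), len(zhihu_comments), len(weibo_comments))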

Added a sentiment analysis module

import paddlehub as ph

senta = ph.Module(name="senta_bilstm")

def get_emotion(texts, emo_list):
    # emo_list carries the running counts [positive, neutral, negative]
    dic = {'positive': emo_list[0], 'neutral': emo_list[1], 'negative': emo_list[2]}
    input_dict = {"text": texts}
    results = senta.sentiment_classify(data=input_dict)
    for result in results:
        # Treat borderline positive probabilities as neutral
        if 0.35 <= result['positive_probs'] < 0.65:
            dic['neutral'] += 1
        else:
            dic[result['sentiment_key']] += 1

    res = []
    for item in dic.values():
        res.append(item)

    return res
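
A minimal sketch of how the counter is meant to be called, assuming the senta module above loaded successfully; the sample texts are made up:

counts = [0, 0, 0]    # running totals for [positive, neutral, negative]
sample_texts = ['今天的讲座很精彩', '食堂排队时间太长了']    # hypothetical sample texts
counts = get_emotion(sample_texts, counts)
print(counts)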

Reinforced the word-segmentation lexicons

  • Stop-word lexicon: 1980 entries >>> 2770 entries
  • Irrelevant-word lexicon: 70 entries >>> 120 entries
  • Reserved-word lexicon: 18 entries >>> 78 entries
  • Also added an extra cleaning pass over the segmentation results (see the sketch after this list)
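
The segmentation code itself is not part of this log; the following is only a minimal sketch of what the extra cleaning pass could look like, assuming jieba as the segmenter and hypothetical file names for the three lexicons:

import jieba

def load_words(path):
    # One word per line, UTF-8
    with open(path, encoding='utf-8') as f:
        return {line.strip() for line in f if line.strip()}

stop_words = load_words('stopwords.txt')          # hypothetical stop-word lexicon (2770 entries)
irrelevant_words = load_words('irrelevant.txt')   # hypothetical irrelevant-word lexicon (120 entries)

# Reserved words go into jieba's user dictionary so they are never split apart
for word in load_words('reserved.txt'):           # hypothetical reserved-word lexicon (78 entries)
    jieba.add_word(word)

def segment_and_clean(text):
    tokens = jieba.lcut(text)
    # Extra cleaning pass: drop whitespace, single characters, stop words and irrelevant words
    return [t for t in tokens
            if t.strip() and len(t) > 1
            and t not in stop_words and t not in irrelevant_words]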