Weibo

Scraping Weibo data

import requests


url = 'https://m.weibo.cn/api/container/getIndex?uid=2514127734&t=0&luicode=10000011&lfid=100103type%3D1%26amp%3Bq%3D%E5%B7%A5%E5%95%86%E7%A7%98%E5%AF%86&type=uid&value=2514127734&containerid=1076032514127734'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
           +'Chrome/62.0.3202.94 Safari/537.36'}
response = requests.get(url, headers=headers)

data = response.json()
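Before digging into the payload it is worth checking the ok flag, which the comment-fetching code further down also relies on. A minimal sketch:

# The mobile API marks a successful response with ok == 1.
if data.get('ok') == 1:
    print('cards returned:', len(data['data']['cards']))
else:
    print('request failed or rate-limited:', data)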


Scrolling the page triggers async requests in which only the page query parameter changes:

'https://m.weibo.cn/api/container/getIndex?containerid=2304132514127734_-_WEIBO_SECOND_PROFILE_WEIBO&luicode=10000011&lfid=2302832514127734&page_type=03&page=2'

'https://m.weibo.cn/api/container/getIndex?containerid=2304132514127734_-_WEIBO_SECOND_PROFILE_WEIBO&luicode=10000011&lfid=2302832514127734&page_type=03&page=3'

'https://m.weibo.cn/api/container/getIndex?containerid=2304132514127734_-_WEIBO_SECOND_PROFILE_WEIBO&luicode=10000011&lfid=2302832514127734&page_type=03&page=4'
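Since only page varies, the URLs can be generated from a template; main() below does exactly this with str.format. A small sketch:

# Build the async-feed URL for arbitrary pages.
BASE = ('https://m.weibo.cn/api/container/getIndex?containerid='
        '2304132514127734_-_WEIBO_SECOND_PROFILE_WEIBO&luicode=10000011'
        '&lfid=2302832514127734&page_type=03&page={}')
for page in range(2, 5):
    print(BASE.format(page))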



cards = data['data']['cards']
for card in cards:
    # Use .get() when pulling fields out of the JSON:
    # a card may or may not carry an 'mblog' entry, so guard before indexing.
    if card.get('mblog'):
        print(card['mblog']['text'])
        print('\n')
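The text field arrives with inline HTML. The crawler below keeps only Chinese characters; a gentler option, hinted at by the commented-out regex further down, is to strip just the tags. A sketch:

import re

def strip_tags(html_text):
    # Drop HTML tags but keep the visible text.
    return re.sub('<[^<]+?>', '', html_text).replace('\n', '').strip()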
    


  
# Async loading: crawl the pages with a thread pool
import time
import requests
from multiprocessing.dummy import Pool   # thread pool
import openpyxl
import re
import os
from fake_useragent import UserAgent
import ssl

ssl._create_default_https_context = ssl._create_unverified_context  # skip HTTPS certificate verification


os.chdir(r'C:\Users\Administrator\Desktop')

def main(offset):
    # Entry point for one page: build the URL, fetch it, and parse the cards.
    # Keep the template in adjacent string literals; indented backslash
    # continuations would inject literal spaces into the URL.
    url = ('https://m.weibo.cn/api/container/getIndex?containerid='
           '2304132514127734_-_WEIBO_SECOND_PROFILE_WEIBO&luicode=10000011'
           '&lfid=2302832514127734&page_type=03&page={}')
    url = url.format(offset)
    time.sleep(2)
    req = requests.get(url, timeout=30, headers=headers)
    data = req.json()
    cards = data['data']['cards']
    
    for card in cards:
        # Use .get() when pulling fields out of the JSON:
        # a card may or may not carry an 'mblog' entry.
        if card.get('mblog'):
            result1 = card['mblog']['text']
            ids = card['mblog']['id']
            cop = re.compile(r"[^\u4e00-\u9fa5]")  # matches any non-Chinese character
            string1 = cop.sub('', result1)         # keep only the Chinese characters
            # result2 = re.sub('<[^<]+?>', '', result1).replace('\n', '').strip()
            sheet.append([string1, '0', '0', '0', '0'])

            # ======= comments under this weibo post
            url = 'https://m.weibo.cn/comments/hotflow?id=' + ids + '&mid=' + ids + '&max_id_type=0'
            response1 = requests.get(url, timeout=30, headers=headers)
            dat1 = response1.json()

            if dat1['ok'] == 1:
                pinglun_num = dat1['data']['total_number']
                da = dat1['data']['data']
                for com in da:  # fresh name: do not shadow the outer loop variable
                    pinglun = cop.sub('', com['text'])
                    created_at = com['created_at']
                    user1 = com['user']['screen_name']
                    sheet.append(['0', pinglun, user1, '0', '0'])

                    # replies to this comment ('comments' is False when there are none)
                    if com['total_number'] > 0 and com['comments']:
                        for g in com['comments']:
                            text = cop.sub('', g['text'])
                            user2 = g['user']['screen_name']
                            sheet.append(['0', '0', '0', text, user2])

            # with open('工商秘密.csv', mode='a', encoding='utf-8') as f:
            #     f.write(','.join([result]))
            #     f.write('\n')
         
    
if __name__ == '__main__':
    wb = openpyxl.Workbook()    # workbook object
    sheet = wb.active           # active worksheet
    # header row
    sheet.append(['秘密内容', '评论', '评论者', '评论中的评论', '评论者中的评论者'])
    # request headers with a random User-Agent
    headers = {'User-Agent': UserAgent(verify_ssl=False).random}
    # crawl with a thread pool
    print('multithreaded crawl started')
    start_time = time.time()
    p = Pool(8)
    p.map(main, range(65, 70))
    # close the pool and wait for the worker threads to finish,
    # so the main thread does not exit before they do
    p.close()
    p.join()
    # save the results
    wb.save(r'C:\Users\Administrator\Desktop\工商秘密40.xlsx')
    end_time = time.time()
    print('multithreaded crawl finished')
    print('elapsed:', end_time - start_time)
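hotflow returns only the first page of hot comments. The response also carries a max_id cursor for the next page (an assumption about the mobile API, consistent with the max_id_type parameter used above); a sketch of paging with it:

# Sketch: page through comments with the max_id cursor.
# Assumption: the hotflow response exposes data['max_id'] for the next page.
def fetch_all_comments(ids, headers, max_pages=5):
    results, max_id = [], 0
    for _ in range(max_pages):
        url = ('https://m.weibo.cn/comments/hotflow?id=' + ids + '&mid=' + ids
               + '&max_id=' + str(max_id) + '&max_id_type=0')
        dat = requests.get(url, timeout=30, headers=headers).json()
        if dat.get('ok') != 1:
            break
        results.extend(dat['data']['data'])
        max_id = dat['data'].get('max_id', 0)
        if not max_id:          # 0 means no further pages
            break
        time.sleep(1)           # be polite to the endpoint
    return results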

Time processing

import time
import pandas as pd

df = pd.read_excel('C:\\Users\\Administrator\\Desktop\\发布时间.xlsx')

def trans_format(time_string, from_format='%a %b %d %H:%M:%S +0800 %Y', to_format='%Y-%m-%d %H:%M:%S'):
    """
    Convert a Weibo created_at timestamp into a standard datetime string.
    :param time_string: raw timestamp from Weibo's created_at field
    :param from_format: format of the input string
    :param to_format: desired output format
    :return: reformatted time string
    """
    time_struct = time.strptime(time_string, from_format)
    times = time.strftime(to_format, time_struct)
    return times

if __name__ == "__main__":
    df["处理后的时间"] = df['时间'].apply(trans_format)
    df.to_excel('shuju.xlsx')
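For reference, a quick check of the conversion (hypothetical timestamp in Weibo's created_at style):

print(trans_format('Tue Jun 01 22:15:00 +0800 2021'))
# -> '2021-06-01 22:15:00'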
    


bins = pd.read_csv(r"123.csv")
bins

def cut_bins(x):
    # x is a time of day expressed as a fraction of a day
    # (Excel convention: 0.083333 = 02:00, 0.25 = 06:00, ...).
    if 0.083333 <= x < 0.250000:
        return '02:00:00--06:00:00'
    elif 0.250000 <= x < 0.458333:
        return '06:00:00--11:00:00'
    elif 0.458333 <= x < 0.541667:
        return '11:00:00--13:00:00'
    elif 0.541667 <= x < 0.708333:
        return '13:00:00--17:00:00'
    elif 0.708333 <= x < 0.791667:
        return '17:00:00--19:00:00'
    elif 0.791667 <= x < 0.916667:
        return '19:00:00--22:00:00'
    else:
        return '22:00:00--02:00:00'

bins['所在时间段'] = bins['将时分秒改成常规'].map(cut_bins)
bins
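The same binning can be done vectorized with pd.cut (a sketch under the same day-fraction assumption; right=False reproduces the half-open intervals above, and the wrap-around 22:00--02:00 bucket is handled via fillna):

# Sketch: vectorized binning; values outside 02:00–22:00 fall into
# the wrap-around bucket via fillna.
edges  = [0.083333, 0.250000, 0.458333, 0.541667, 0.708333, 0.791667, 0.916667]
labels = ['02:00:00--06:00:00', '06:00:00--11:00:00', '11:00:00--13:00:00',
          '13:00:00--17:00:00', '17:00:00--19:00:00', '19:00:00--22:00:00']
bins['所在时间段'] = (pd.cut(bins['将时分秒改成常规'], bins=edges,
                             labels=labels, right=False)
                       .astype(object)
                       .fillna('22:00:00--02:00:00'))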


bins['所在时间段'].value_counts()
'''
22:00:00--02:00:00    341
13:00:00--17:00:00    222
06:00:00--11:00:00    183
19:00:00--22:00:00    156
17:00:00--19:00:00    101
11:00:00--13:00:00     98
02:00:00--06:00:00     19
Name: 所在时间段, dtype: int64
'''


from palettable.colorbrewer.qualitative import Pastel1_7
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams['font.sans-serif'] = ['KaiTi']   # CJK-capable font
plt.rcParams['axes.unicode_minus'] = False

fig, ax = plt.subplots(figsize=(7, 3), subplot_kw=dict(aspect="equal"))
recipe = [
    '13:00:00--17:00:00',
    '06:00:00--11:00:00',
    '22:00:00--02:00:00',
    '19:00:00--22:00:00',
    '17:00:00--19:00:00',
    '11:00:00--13:00:00',
    '02:00:00--06:00:00',
]
data = [222, 183, 341, 156, 101, 98, 19]

wedges, texts = ax.pie(data, colors=Pastel1_7.hex_colors,
                       wedgeprops=dict(width=0.4), startangle=-40)

# Annotation box per category:
# boxstyle = box type, fc = fill color, ec = edge color, lw = line width
bbox_props = dict(boxstyle="square,pad=0.3", fc='white', ec="black", lw=0.72)
# how the leader lines are drawn
kw = dict(arrowprops=dict(arrowstyle="-"),
          bbox=bbox_props, zorder=0, va="center")

# attach a label to each wedge
for i, p in enumerate(wedges):
    ang = (p.theta2 - p.theta1) / 2. + p.theta1
    y = np.sin(np.deg2rad(ang))
    x = np.cos(np.deg2rad(ang))
    # left- or right-align depending on which side of the pie we are on
    horizontalalignment = {-1: "right", 1: "left"}[int(np.sign(x))]
    connectionstyle = "angle,angleA=0,angleB={}".format(ang)
    kw["arrowprops"].update({"connectionstyle": connectionstyle})
    ax.annotate(recipe[i], xy=(x, y), xytext=(1.35 * np.sign(x), 1.4 * y),
                horizontalalignment=horizontalalignment, color='black', **kw)

ax.set_title("秘密发布微博时间数据分析")
# save before show: show() clears the current figure, so calling
# savefig afterwards would write an empty image
plt.savefig('12.png', dpi=1000)
plt.show()


Sentiment analysis + plotting

# Sentiment analysis

import paddlehub as hub
senta = hub.Module(name="senta_bilstm")

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_excel('C:\\Users\\Administrator\\Desktop\\info111.xlsx')

texts = df['秘密内容'].tolist()
input_data = {'text':texts}

res = senta.sentiment_classify(data=input_data)
df['积极分值'] = [x['positive_probs'] for x in res]
df['消极分值'] = [x['negative_probs'] for x in res]
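Each element of res is a dict; the two fields consumed above are positive_probs and negative_probs (any further keys, such as a discrete sentiment label, are an assumption about the senta_bilstm output). A quick sanity check:

# Inspect one result entry; these are the fields used above.
print(res[0]['positive_probs'], res[0]['negative_probs'])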


df["积极分值"].mean()


t = df.eval("积极分值-消极分值")
df["情感倾向"] = np.where(t > 0, "正面", np.where(t == 0, "中立", "负面"))
df


from pyecharts.charts import *
from pyecharts import options as opts
from pyecharts.globals import ThemeType
import os

os.chdir(r'C:\Users\Administrator\Desktop')
df2 = df.groupby('情感倾向')['秘密内容'].count()
df2 = df2.sort_values(ascending=False)
regions = df2.index.to_list()
values = df2.to_list()
c = (
        Pie(init_opts=opts.InitOpts(theme=ThemeType.CHALK))
        .add("", list(zip(regions, values)), radius=["40%", "70%"])
        .set_global_opts(title_opts=opts.TitleOpts(title="秘密微博情感倾向",
                                                   pos_top="2%", pos_left='center'))
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}:{d}%", font_size=18))
        .render("数据可视化.html")
    )


## Donut chart

from pyecharts.charts import *
from pyecharts import options as opts 
from pyecharts.globals import ThemeType  
import os
os.chdir(r'C:\Users\Administrator\Desktop')
ID = ['海啸****','寻找****道长','最好****18881','我可****假猴子ha','工商***精']
values = [53,47,36,34,34]
c = (
        Pie(init_opts=opts.InitOpts(theme=ThemeType.CHALK))
        .add("", list(zip(ID, values)), radius=["40%", "70%"])
        .set_global_opts(
            title_opts=opts.TitleOpts(title="秘密微博评论者TOP5", subtitle="卖山楂啦prss",
                                      pos_top="2%", pos_left='center'),
            toolbox_opts=opts.ToolboxOpts(
                is_show=True,   # show the toolbox widget
            ))
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}", font_size=18))
        .render("数据可视化2.html")   # different file name, so the first chart is not overwritten
    )

Word clouds (stylecloud + wordcloud)

# Word cloud
import jieba
import pandas as pd
import stylecloud

df = pd.read_excel('C:\\Users\\Administrator\\Desktop\\工作簿2.xlsx')


def get_cut_words(content_series):
    # load the stop-word list
    stop_words = []
    with open(r"C:\Users\Administrator\Desktop\chineseStopWords.txt", 'r') as f:
        for line in f.readlines():
            stop_words.append(line.strip())
    # register domain terms so jieba keeps them whole
    my_words = ['5G', 'CPS', '高速公路', '人工智能', '数字孪生体', '工业大数据', '智能大数据']
    for w in my_words:
        jieba.add_word(w)
    # custom stop words (note the comma after '集团' — without it Python
    # silently concatenates the adjacent literals into '集团1')
    my_stop_words = ['谢谢', '', '朋友', '...', '有没有', '集团',
                     '1', '签署', '一根', '一个', '这次', '自营',
                     '阿克苏', '印尼', '全文', '这是', '国家', '马上', '超级', '小哥'
                     ]
    stop_words.extend(my_stop_words)
    # segment
    content = ';'.join([str(c) for c in content_series.tolist()])
    word_num = jieba.lcut(content)
    # keep words of length >= 2 that are not stop words
    word_num_selected = [w for w in word_num if w not in stop_words and len(w) >= 2]
    return word_num_selected
text1 = get_cut_words(content_series=df['评论'])

  
# Remap a few segmented words before plotting
# (substitutions chosen by the author for presentation).
replacements = {
    '嘎嘣脆': '喜欢', '真的': '考研', '问问': '图书馆', '请问': '寝室',
    '麻烦': '图书馆', '谢谢': '有偿', '投稿': '女朋友', '有人': '宿舍',
    '东西': '食堂', '你好': '食堂', '那种': '奶茶',
}
text1 = [replacements.get(w, w) for w in text1]
        
        

from stylecloud import gen_stylecloud

result = " ".join(text1)
gen_stylecloud(text=result,
               # a CJK font is required, otherwise the characters render incorrectly
               font_path='C:\\Windows\\Fonts\\STKAITI.TTF',
               # icon_name='fas fa-envira',
               icon_name='fas fa-hand-holding-heart',
               max_words=150,
               size=1500,
               # max_font_size=70,
               output_name='C:\\Users\\Administrator\\Desktop\\t11123.png',
               )




# Word cloud with the wordcloud package
import jieba
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
from collections import Counter
import PIL.Image   # 'import PIL' alone does not load the Image submodule


c = Counter(text1)
common_c = c.most_common(300)   # top 300 words with their counts
common_c

# load the mask image that defines the cloud's shape
mask = np.array(PIL.Image.open('C:\\gs.png'))


wc = WordCloud(
            # a CJK font is required, otherwise the characters render incorrectly
            font_path='C:/Windows/Fonts/STXINGKA.TTF',
            background_color='white',
            scale=10,                # higher = sharper image, but slower and more memory
            mask=mask,               # cloud shape
            colormap='tab10',
            width=900, height=600,
            # max_words=300,         # cap on the number of words shown
            max_font_size=60,        # largest font size
            min_font_size=3,         # smallest font size
            random_state=50          # fixed seed for a reproducible layout
            )
# build the cloud from word frequencies
wc.generate_from_frequencies(dict(common_c))
# display
plt.imshow(wc)
plt.axis('off')
plt.show()
# save to disk
wc.to_file('C:\\Users\\Administrator\\Desktop\\pic.jpg')