Scraping Weibo Comments with Python

The script below pulls a user's statuses from the m.weibo.cn mobile API, saves the text of their comments to a local file, and renders the result as a jieba-segmented word cloud.

# -*- coding: utf-8 -*-
import json
import urllib2
import sys

import jieba
import numpy as np
import PIL.Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud

reload(sys)
sys.setdefaultencoding('utf8')

# Query payload and headers for the m.weibo.cn container API.
# The uid / value / containerid fields are elided in the original post;
# fill them in for the account you want to crawl.
textmod = {"uid": ".....",
           "luicode": "10000011",
           "lfid": "100103type=3&q=张杰",
           "featurecode": "20000180",
           "type": "uid",
           "value": "....",
           "containerid": "....."}
textmod = json.dumps(textmod)

header_dict = {'Connection': 'keep-alive',
               'Cookie': '',
               'Accept-Language': 'zh-CN,zh;q=0.8',
               'Host': 'm.weibo.cn',
               'Referer': '............',
               'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
               'X-Requested-With': 'XMLHttpRequest'}

def wordcloudplot(txt):
    # Font for rendering Chinese text in the cloud.
    path = r'C:\Users\Administrator\Downloads\msyh.ttf'
    path = unicode(path, 'utf8').encode('gb18030')
    # Image whose silhouette masks the shape of the word cloud.
    alice_mask = np.array(PIL.Image.open('E:\\aa.jpg'))
    wordcloud = WordCloud(font_path=path,
                          background_color="white",
                          margin=5, width=1800, height=800,
                          mask=alice_mask, max_words=2000, max_font_size=60,
                          random_state=42)
    wordcloud = wordcloud.generate(txt)
    wordcloud.to_file('E:\\aa1.jpg')
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
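Before wiring the crawler in, the rendering step can be smoke-tested on its own. A minimal sketch, assuming the font and mask paths in wordcloudplot() exist on your machine; the inline sample string is made up for illustration:

# Standalone smoke test of the jieba -> WordCloud pipeline,
# bypassing the E:\commentqq.txt dump that main() reads.
sample = u'今天的演唱会太精彩了 歌声还是一如既往地好听'  # hypothetical sample text
words = [w for w in jieba.cut(sample) if len(w) > 1]
wordcloudplot(' '.join(words))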

def main():
    # Read the saved comments, segment them with jieba, and keep
    # only words longer than one character.
    a = []
    f = open(r'E:\commentqq.txt', 'r').read()
    words = list(jieba.cut(f))
    for word in words:
        if len(word) > 1:
            a.append(word)
    txt = ' '.join(a)
    wordcloudplot(txt)

def get_comment(que):
    # Fetch up to 999 pages of comments for every status id in `que`
    # and append the cleaned text to E:\commentqq.txt.
    f = open(r'E:\commentqq.txt', 'w')
    for each in que:
        for i in range(1, 1000):
            textmood = {"id": each, "page": i}
            textmood = json.dumps(textmood)
            uu = 'https://m.weibo.cn/status/' + str(each)
            header = {'Connection': 'keep-alive',
                      'Cookie': '.....',
                      'Accept-Language': 'zh-CN,zh;q=0.8',
                      'Host': 'm.weibo.cn',
                      'Referer': uu,
                      'User-Agent': '......',
                      'X-Requested-With': 'XMLHttpRequest'}
            url = 'https://m.weibo.cn/api/comments/show?id=%s&page=%s' % (str(each), str(i))
            print url
            req = urllib2.Request(url=url, data=textmood, headers=header)
            res = urllib2.urlopen(req)
            res = res.read()
            contents = res
            d = json.loads(contents, encoding="utf-8")
            if 'data' in d:
                data = d['data']
                if data != "":
                    for each_one in data:
                        if each_one != "":
                            if each_one['text'] != "":
                                # Keep only the text before the first HTML tag
                                # and skip reply ("回复...") comments.
                                mm = each_one['text'].split('<')
                                if r'回复' not in mm[0]:
                                    index = mm[0]
                                    print index
                                    f.write(index.encode("utf-8"))
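The split('<') above works because the API returns each comment's text field with embedded HTML (reply links, emoji <img> tags); keeping only the part before the first tag also throws away anything after an emoji. A more thorough variant, shown here as a hypothetical helper rather than the author's approach, strips every tag with a regex:

# Hypothetical helper: remove all HTML tags from a comment's text
# instead of truncating at the first '<' as get_comment() does.
import re

def clean_comment(text):
    return re.sub(r'<[^>]+>', '', text).strip()

# usage inside the loop, e.g.:
# f.write(clean_comment(each_one['text']).encode('utf-8'))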

def get_identified():
    # Walk the user's profile pages and collect the numeric status ids
    # that get_comment() will crawl.
    que = []
    url = 'https://m.weibo.cn/api/container/getIndex?uid=1241148864&luicode=10000011&lfid=100103type%3D3%26q%3D%E5%BC%A0%E6%9D%B0&featurecode=20000180&type=uid&value=1241148864&containerid=1076031241148864'
    for i in range(1, 10):
        if i > 1:
            url = 'https://m.weibo.cn/api/container/getIndex?uid=1241148864&luicode=10000011&lfid=100103type%3D3%26q%3D%E5%BC%A0%E6%9D%B0&featurecode=20000180&type=uid&value=1241148864&containerid=1076031241148864&page=' + str(i)
        print url
        req = urllib2.Request(url=url, data=textmod, headers=header_dict)
        res = urllib2.urlopen(req)
        res = res.read()
        content = res
        d = json.loads(content, encoding="utf-8")
        data = d['cards']
        if data != "":
            for each in data:
                print each['itemid']
                mm = each['itemid']
                if mm != "":
                    # The numeric status id is embedded in itemid after the
                    # '-'; drop its leading character.
                    identity = mm.split('-')
                    num = identity[1][1:]
                    que.append(num)
                    print num
    get_comment(que)

if __name__ == '__main__':
    get_identified()
    main()
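For readers on Python 3, where urllib2 and reload(sys) are gone, the same comments call can be sketched with requests. This is a minimal sketch, not a drop-in replacement: the endpoint and headers mirror get_comment() above, the cookie is elided just as in the original, the response layout is assumed to still match what the Python 2 script parses, and real code would add error handling and rate limiting.

# Python 3 sketch of the comment-fetching loop using requests.
import requests

def fetch_comments(status_id, max_pages=10):
    headers = {
        'Host': 'm.weibo.cn',
        'Referer': 'https://m.weibo.cn/status/%s' % status_id,
        'User-Agent': 'Mozilla/5.0',
        'X-Requested-With': 'XMLHttpRequest',
        'Cookie': '.....',  # a logged-in cookie, elided as in the original
    }
    for page in range(1, max_pages + 1):
        url = 'https://m.weibo.cn/api/comments/show?id=%s&page=%s' % (status_id, page)
        d = requests.get(url, headers=headers).json()
        if 'data' not in d:
            break
        for item in d['data']:
            yield item['text']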
