# 微博个人标签抓取 (Weibo personal-tag scraper)
import gevent
import gevent.monkey
gevent.monkey.patch_all()
import requests
from lxml import etree
import re
import time
import datetime
import pandas as pd
import csv
import json
# Request headers for weibo.com.
# NOTE(review): the Cookie is a hardcoded, dated session — it has almost
# certainly expired and must be refreshed from a logged-in browser session
# before this scraper will return profile pages instead of a login redirect.
headers = {
"Cookie": "SINAGLOBAL=233870499042.07034.1544424889628; Hm_lvt_cdca62f337ad44ec441f4f40b393c2c7=1544920634,1544936104,1545021667,1545124344; _ga=GA1.2.166165776.1546848340; __gads=ID=877682d3e7e9179656,1559271999,1559523644; _s_tentry=hao.360.com; UOR=,,spr_web_360_hao360_weibo_t001; Apache=3936416404558.185.1559523650708; ULV=1559523651582:151:1:1:3936416404558.185.1559523650708:1559271476158; _gid=GA1.2.87872556.1559553371; YF-Page-G0=7f483edf167a381b771295af62b14a27|1559612139|1559612139; Hm_lpvt_96d9d92b8a4aac83bc206b6c9fb2844a=1559612141",
"Host": "weibo.com",
# Desktop Chrome UA so weibo.com serves the full desktop profile markup.
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36",
}
# Split a list into consecutive chunks (the original comment claimed "split
# evenly into n parts", but the code actually chunks by *size* n).
def func(listTemp, n):
    """Yield successive slices of *listTemp*, each at most *n* elements long.

    The last chunk may be shorter than *n*. Raises ValueError if n == 0
    (range() rejects a zero step).
    """
    for start in range(0, len(listTemp), n):
        yield listTemp[start:start + n]
# Accumulator for scraped results; each entry is [user_id, profile_url, tags].
# Shared (appended to) by main() running in gevent greenlets.
g=[]
# Load user ids from a local CSV (no header row; first column is the id).
# NOTE(review): the [1:10] slice keeps only rows 1-9 — presumably skipping a
# header line and limiting a test run; confirm this cap is intentional.
word_pd=pd.read_csv(r"C:\Users\Lavector\Desktop\百事小红书\微博标签.csv",engine='python',header=None).values.tolist()[1:10]
print(word_pd)
print(len(word_pd))
def main(key_word):
    """Fetch the Weibo profile page for user id *key_word* and collect tags.

    If the page contains a tag section ("标签:"), extract the embedded HTML
    fragment from the page's inline JSON, pull the tag link texts out of it,
    and append [key_word, profile_url, tags] to the module-level list ``g``.
    Network or parse failures are logged and skipped so one bad id does not
    kill the whole greenlet pool.
    """
    profile_url = "https://weibo.com/p/100505{}/info?mod=pedit_more".format(key_word)
    print(key_word, profile_url)
    try:
        # Timeout so a stalled connection cannot hang a greenlet forever.
        resp = requests.get(url=profile_url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print("request failed for", key_word, ":", exc)
        return
    resp.encoding = "utf-8"
    page = resp.text
    print("++++++++++++++++")
    if "标签:" not in page:
        # Profile has no tag section — nothing to collect.
        return
    # The tag block is embedded as an escaped HTML string inside an inline
    # <script> JSON payload; pull it out and unescape \r, \n and backslashes.
    matches = re.findall(r'domid":"Pl_Official_PersonalInfo__57.*?"html":"(.*?)"}\)</script>', page, re.S)
    if not matches:
        # "标签:" appeared but the expected payload block did not — guard
        # against the IndexError the original [0] would raise here.
        print("tag payload not found for", key_word)
        return
    fragment = matches[0].replace("\\r", "").replace("\\n", "").replace("\\", "")
    print(fragment)
    tree = etree.HTML(fragment)
    # Tag names live in <a node-type="tag"> elements; join them comma-separated.
    tags = ','.join(tree.xpath('//a[@node-type="tag"]//text()')).replace(" ", "")
    print(tags)
    g.append([key_word, profile_url, tags])
if __name__ == "__main__":
temp = func(word_pd, 10)
# print(temp[0])
print(temp)
for i in temp:
print(i)
# length = len(i)
xclist = [] # 构建协程链接池
for w in range(len(i)):
xclist.append(gevent.spawn(main, i[w][0]))
# time.sleep(1)
# gevent.joinall(xclist)
print(xclist)
gevent.joinall(xclist)
with open("标签微博111113.csv", "w", encoding="utf-8-sig", newline="") as f:
k = csv.writer(f, dialect="excel")
k.writerow(["id", "个人简介", "标签"])
for list1 in g:
k.writerow(list1)