# 没有登录的版本,爬的页面数量有限 — not-logged-in version; only a limited number of pages can be crawled
# 主要是处理json的顺序,做一个记录 — mainly a record of the order in which the JSON payload is processed
import json
from copy import deepcopy

import requests
from lxml import etree
class Weibo(object):
    """Scrape the unlogged-in Weibo "loading more" AJAX feed into weibo.csv.

    Without login only a limited number of pages are served.  Each response is
    JSON whose 'data' field is an HTML fragment; the fragment is cleaned and
    parsed with lxml XPath, and the extracted records are appended to a CSV.
    """

    def __init__(self):
        # Cookie/User-Agent captured from a browser session; the endpoint may
        # refuse to serve content without them.
        self.headers = {
            "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
            "Cookie": "SINAGLOBAL=7599228186083.695.1499079704863; UM_distinctid=1650e0b9fd5fe-0be0aef851589a-737356c-100200-1650e0b9fd6460; login_sid_t=528fe9d40502191efa7a39c3570e2648; cross_origin_proto=SSL; TC-Ugrow-G0=e66b2e50a7e7f417f6cc12eec600f517; TC-V5-G0=ac3bb62966dad84dafa780689a4f7fc3; _s_tentry=www.baidu.com; wb_view_log=1366*7681; Apache=8910395867108.623.1535108015266; ULV=1535108015274:87:4:1:8910395867108.623.1535108015266:1534297139876; SCF=AuQUwf63PoPCuREHCy74Ls6fWNyx_TM0sEp-kb67np3Z2aYZUSXa1-yiaae_ba4wR3vkr3Wl7B8DHybqaQJXt24.; SUHB=0V5V55cALO8NM_; un=13913292465; wb_view_log_6108068337=1366*7681; TC-Page-G0=0cd4658437f38175b9211f1336161d7d; SUB=_2AkMsI2sVdcPxrAVQmPAVymPhb4pH-jyf9gLjAn7uJhMyAxh77lA3qSVutBF-XA8geeqirJoe-l4iNQQ2DTjSG_iC; SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9WWYAY6v3pepGn0LLaK2PXs85JpVF0201K20eo.EShqf; UOR=www.baidu.com,vdisk.weibo.com,login.sina.com.cn; WBStorage=e8781eb7dee3fd7f|undefined",
            # "Host":"weibo.com",
        }
        self.proxies = {"https": "https://192.168.43.1:1800"}
        self.s = requests.Session()
        self.page_num = 1  # 1-based progress counter used only for logging

    def get_url(self):
        """Return the 20 AJAX feed URLs (page parameter 0..19)."""
        return [
            "https://weibo.com/a/aj/transform/loadingmoreunlogin?"
            "ajwvr=6&category=0&page={}&lefnav=0&cursor=".format(i)
            for i in range(20)]

    @staticmethod
    def _first(nodes, default=''):
        """Return the first element of an xpath result list, or *default* if empty."""
        return nodes[0] if nodes else default

    def parse_url(self, url):
        """Fetch one feed page and extract its post records.

        Returns a list of dicts with keys content/time/author/forward/comment/
        praised/img_src, or None when the HTML fragment cannot be parsed.
        """
        response = self.s.get(url, proxies=self.proxies, headers=self.headers).content.decode()
        content = json.loads(response)['data']
        # The fragment contains escaped line breaks and zero-width/nbsp chars
        # that confuse lxml; strip them before parsing.  Order matters: the
        # backslashes are removed only after '\n' and '\r' (same order as the
        # original chained .replace calls).
        for junk in ('\n', '\r', '\\', '\u200b', '\xa0'):
            content = content.replace(junk, '')
        try:
            content = etree.HTML(content)
        except Exception:
            # Unparseable fragment — caller (run/save_data) treats None as "no data".
            return None
        msgs = content.xpath('//div[@class="UG_list_a"]|//div[@class="UG_list_b"] | //div[@class="UG_list_v2"] ')
        print(url)
        print(len(msgs))
        data = []
        for msg in msgs:
            item = {}
            # content/img_src stay as lists; save_data joins them with "".join.
            item['content'] = msg.xpath(
                './/div/div[2]/h3/div/text() | .//div/h3/div/text()')
            print(item['content'])
            # _first guards posts missing a field — the original indexed [0]
            # directly and crashed with IndexError on such posts.
            item['time'] = self._first(msg.xpath(
                './/div/div[2]/div/span[1]/text() |.//div/span[1]/text()'))
            item['author'] = self._first(msg.xpath(
                './/div[2]/div[1]/a[2]/span/text()| .//div[2]/a[2]/span/text()'))
            item['forward'] = self._first(msg.xpath(
                './/span/em[@class="W_ficon ficon_forward S_ficon W_f16"]/following-sibling::em[1]/text()'))
            item['comment'] = self._first(msg.xpath(
                './/em[@class="W_ficon ficon_repeat S_ficon W_f16"]/following-sibling::em[1]/text()'))
            item['praised'] = self._first(msg.xpath(
                './/span/em[@class="W_ficon ficon_praised S_ficon W_f16"]/following-sibling::em[1]/text()'))
            item['img_src'] = msg.xpath(
                './/div[contains(@class,"pic W_piccut_v")]/img/@src')
            # item is a fresh dict each iteration, so the original deepcopy
            # before append was redundant.
            data.append(item)
        return data

    def save_data(self, data):
        """Append the records of one page to weibo.csv; no-op when data is falsy.

        NOTE(review): fields are joined naively with commas, so content that
        itself contains commas breaks the CSV — the csv module would be safer.
        """
        if not data:
            return
        print('data:', data)
        # Explicit encoding so non-ASCII content does not crash on platforms
        # whose default file encoding is not UTF-8.
        with open('weibo.csv', 'a+', encoding='utf-8') as f:
            for i in data:
                print('单条记录:', i)
                row = [
                    "".join(i['content']),
                    i['author'],
                    i['time'],
                    i['forward'],
                    i['praised'],
                    i['comment'],
                    "".join(i['img_src']),
                ]
                f.write(','.join(row) + '\r\n')

    def run(self):
        """Crawl every feed URL in order, persisting each page's records."""
        for url in self.get_url():
            self.save_data(self.parse_url(url))
            print('第{}页'.format(self.page_num))
            self.page_num += 1
if __name__ == '__main__':
    # Entry point: build the crawler and walk every feed page.
    spider = Weibo()
    spider.run()