~~~python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/3/19 14:39
# @Author  : 马贝贝
# @Software: PyCharm
# @Project : catch
# @File    : weibo.py
# @Warning : read-only except for the owner
import scrapy

from jihui import items


class Weibo_spider(scrapy.Spider):
    name = 'weibo'

    def start_requests(self):
        # Pose as Baiduspider so weibo.com serves the page without a login wall.
        headers = {'User-Agent': 'Baiduspider',
                   'Referer': 'https://weibo.com/1986481745/GqoLd2aJe?type=comment'}
        url = 'https://weibo.com/u/1355702654?topnav=1&wvr=6&topsug=1&is_all=1'
        request = scrapy.Request(url=url, callback=self.parse, headers=headers)
        return [request]
    def parse(self, response):
        html = response.text
        # print(html)
        # Screen names and post bodies on the profile page.
        names = response.xpath('//a[@class="W_f14 W_fb S_txt1"]/text()').extract()
        messages = response.xpath('//div[@class="WB_text W_f14"]/text()').extract()
        # href of the "following" list link; extract_first() returns None instead
        # of raising IndexError when the page has no such link, so the truthiness
        # check below actually works.
        guanzhus = response.xpath('//a[@class="t_link S_txt1"]/@href').extract_first()
        print(names)
        # item = items.Weib()
        # for i in range(len(names)):
        #     item['name'] = names[i]
        #     # item['message'] = messages[i]
        #     yield item

        url = 'https:' + str(guanzhus)
        # url = response.urljoin(url)
        headers = {'User-Agent': 'Baiduspider',
                   'Referer': 'https://weibo.com/1986481745/GqoLd2aJe?type=comment'}
        # Logged-in session cookies copied out of a browser; they expire and
        # have to be refreshed by hand.
        cookies = {
            'YF-V5-G0': 'b4445e3d303e043620cf1d40fc14e97a',
            'YF-Page-G0': '23b9d9eac864b0d725a27007679967df',
            'YF-Ugrow-G0': 'ea90f703b7694b74b62d38420b5273df',
            'ALF': '1563585334',
            'SSOLoginState': '1532049338',
            'SCF': 'Aib6m_mUTqadQmMAoZq7J_9clSBXQdF1OGlbn1soqxRFtuLFhrtB0ehAgW7dHSfArb1MhCfYOHkIYTQLOPnoVUo.',
            'SUB': '_2A252VUfqDeRhGeBN7FoT8izJyjmIHXVVIz4irDV8PUNbmtANLVDNkW9NRCSnrGf71dc8XV4qK_gQ0-9emoBn87zN',
            'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9WFN_uVX-4U9o3TZ-BImu1X75JpX5K2hUgL.Foq0S0nEeozfeK-2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMce0MReozESK2f',
            'SUHB': '0z1wvou-PLWcic',
        }
        # Follow the discovered "following" link; fall back to a fixed
        # follow-list URL when the page did not expose one.
        if guanzhus:
            request = scrapy.Request(url=url, callback=self.parse1, cookies=cookies, headers=headers)
            return [request]
        else:
            request = scrapy.Request(
                url='https://weibo.com/p/1004061223178222/follow?from=page_100406&wvr=6&mod=headfollow#place',
                callback=self.parse1, cookies=cookies, headers=headers)
            return [request]
    def parse1(self, response):
        html = response.text
        print(html)
        # Profile links on the follow list; the first three matches are site
        # navigation rather than followed users.
        purls = response.xpath('//a[@class="S_txt1"]/@href').extract()[3:]
        urls = []
        for purl in purls:
            urls.append('https://weibo.com' + str(purl))
        cookies = {
            'TC-V5-G0': '7975b0b5ccf92b43930889e90d938495',
            'TC-Page-G0': '42b289d444da48cb9b2b9033b1f878d9',
            'ALF': '1563553997',
            'SSOLoginState': '1532017998',
            'SCF': 'Aud4Qht-IQymz54AxTy2x57hPd3u54y-t_fmBH7ovC58f5BSWqNPP4AOdobviOs0Rsb2ozRFfJ6CFeMpEkFBM9I.',
            'SUB': '_2A252VM0fDeRhGeBN7FoT8izJyjmIHXVVI7nXrDV8PUNbmtAKLUbSkW9NRCSnrGnywfcei2GHiOlNACguqxHxKzRm',
            'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9WFN_uVX-4U9o3TZ-BImu1X75JpX5K2hUgL.Foq0S0nEeozfeK-2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMce0MReozESK2f',
            'SUHB': '00xZwFmwDvehKM',
        }
        headers = {'User-Agent': 'Baiduspider',
                   'Referer': 'https://weibo.com/1986481745/GqoLd2aJe?type=comment'}
        meta = {
            'refer_flag': '1005050006_',
            'is_hot': '1',
        }
        # Hand each followed user's profile back to parse(), so the crawl
        # keeps walking the follow graph.
        for url in urls:
            request = scrapy.Request(url=url, cookies=cookies, callback=self.parse, meta=meta, headers=headers)
            yield request
~~~
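
The commented-out item code refers to `items.Weib()` from the `jihui` package, which is not shown. A minimal sketch of what `jihui/items.py` could look like, assuming `Weib` declares only the two fields the spider assigns — the class name comes from the spider itself; everything else here is an assumption, not the original project file:

~~~python
# jihui/items.py -- hypothetical definition inferred from the spider above.
import scrapy


class Weib(scrapy.Item):
    # The only fields the spider's commented-out loop fills in.
    name = scrapy.Field()     # screen name scraped from the profile page
    message = scrapy.Field()  # post text scraped from the profile page
~~~

With that in place, the spider runs under the standard Scrapy CLI from the project root, e.g. `scrapy crawl weibo`.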