Scraping Weibo users

~~~python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/3/19 14:39
# @Author  : 马贝贝
# @Software: PyCharm
# @Project : catch
# @File    : weibo.py
# @warning : read only but owner

import scrapy

from jihui import items

class Weibo_spider(scrapy.Spider):
    name = 'weibo'

    def start_requests(self):
        # Pretend to be Baiduspider so weibo.com serves full pages instead of a login redirect.
        headers = {'User-Agent': 'Baiduspider',
                   'Referer': 'https://weibo.com/1986481745/GqoLd2aJe?type=comment'}
        # Seed the crawl with a single user's homepage.
        url = 'https://weibo.com/u/1355702654?topnav=1&wvr=6&topsug=1&is_all=1'
        request = scrapy.Request(url=url, callback=self.parse, headers=headers)
        return [request]

    def parse(self, response):
        html = response.text
        # print(html)
        # Poster names and post text on the user's homepage.
        names = response.xpath('//a[@class="W_f14 W_fb S_txt1"]/text()').extract()
        messages = response.xpath('//div[@class="WB_text W_f14"]/text()').extract()
        # extract_first() returns None instead of raising IndexError when the link is missing,
        # so the if/else fallback below actually works.
        guanzhus = response.xpath('//a[@class="t_link S_txt1"]/@href').extract_first()
        print(names)
        # item = items.Weib()
        # for i in range(len(names)):
        #     item['name'] = names[i]
        #     item['message'] = messages[i]
        # yield item
        url = 'https:' + str(guanzhus)
        # url = response.urljoin(url)
        headers = {'User-Agent': 'Baiduspider',
                   'Referer': 'https://weibo.com/1986481745/GqoLd2aJe?type=comment'}
        # Session cookies copied from a logged-in browser; they expire and have to be refreshed.
        cookies = {
            'YF-V5-G0': 'b4445e3d303e043620cf1d40fc14e97a',
            'YF-Page-G0': '23b9d9eac864b0d725a27007679967df',
            'YF-Ugrow-G0': 'ea90f703b7694b74b62d38420b5273df',
            'ALF': '1563585334',
            'SSOLoginState': '1532049338',
            'SCF': 'Aib6m_mUTqadQmMAoZq7J_9clSBXQdF1OGlbn1soqxRFtuLFhrtB0ehAgW7dHSfArb1MhCfYOHkIYTQLOPnoVUo.',
            'SUB': '_2A252VUfqDeRhGeBN7FoT8izJyjmIHXVVIz4irDV8PUNbmtANLVDNkW9NRCSnrGf71dc8XV4qK_gQ0-9emoBn87zN',
            'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9WFN_uVX-4U9o3TZ-BImu1X75JpX5K2hUgL.Foq0S0nEeozfeK-2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMce0MReozESK2f',
            'SUHB': '0z1wvou-PLWcic',
        }
        # Follow the user's "关注" (following) list if the link was found, otherwise fall back to a fixed follow page.
        if guanzhus:
            request = scrapy.Request(url=url, callback=self.parse1, cookies=cookies, headers=headers)
            return [request]
        else:
            request = scrapy.Request(
                url='https://weibo.com/p/1004061223178222/follow?from=page_100406&wvr=6&mod=headfollow#place',
                callback=self.parse1, cookies=cookies, headers=headers)
            return [request]

    def parse1(self, response):
        html = response.text
        print(html)
        # Drop the first three hrefs; the remaining ones are taken to be the followed users' profile links.
        purls = response.xpath('//a[@class="S_txt1"]/@href').extract()[3:]
        urls = []
        for purl in purls:
            urls1 = 'https://weibo.com' + str(purl)
            urls.append(urls1)
        # Cookies from a second logged-in session, used for the follow-list pages.
        cookies = {
            'TC-V5-G0': '7975b0b5ccf92b43930889e90d938495',
            'TC-Page-G0': '42b289d444da48cb9b2b9033b1f878d9',
            'ALF': '1563553997',
            'SSOLoginState': '1532017998',
            'SCF': 'Aud4Qht-IQymz54AxTy2x57hPd3u54y-t_fmBH7ovC58f5BSWqNPP4AOdobviOs0Rsb2ozRFfJ6CFeMpEkFBM9I.',
            'SUB': '_2A252VM0fDeRhGeBN7FoT8izJyjmIHXVVI7nXrDV8PUNbmtAKLUbSkW9NRCSnrGnywfcei2GHiOlNACguqxHxKzRm',
            'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9WFN_uVX-4U9o3TZ-BImu1X75JpX5K2hUgL.Foq0S0nEeozfeK-2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMce0MReozESK2f',
            'SUHB': '00xZwFmwDvehKM',
        }
        headers = {'User-Agent': 'Baiduspider',
                   'Referer': 'https://weibo.com/1986481745/GqoLd2aJe?type=comment'}
        meta = {
            'refer_flag': '1005050006_',
            'is_hot': '1'
        }
        # Feed each followed user's homepage back into parse(), so the crawl spreads user by user.
        for url in urls:
            request = scrapy.Request(url=url, cookies=cookies, callback=self.parse, meta=meta, headers=headers)
            yield request

~~~
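
The spider imports `from jihui import items` and the commented-out code builds an `items.Weib()` item, but the items module itself is not shown in the post. A minimal sketch of what `jihui/items.py` would need to contain, assuming only the `name` and `message` fields used in the commented-out loop:

~~~python
# jihui/items.py -- hypothetical sketch; the real module is not included in the post.
import scrapy


class Weib(scrapy.Item):
    # Fields implied by the commented-out assignments in parse()
    name = scrapy.Field()     # poster's display name
    message = scrapy.Field()  # post text
~~~

With the Scrapy project settings in place, the spider can then be started with `scrapy crawl weibo`.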
