scrapy框架爬取虎牙直播有关信息

主程序hy.py:

# -*- coding: utf-8 -*-
import scrapy
from ..items import sortItem,gameInfo,gameSonSort,houseInfo
from scrapy import Request
import re
from time import sleep


class HySpider(scrapy.Spider):
    """Crawl Huya's game directory: categories, games, and live-room links.

    The crawl is a chain of callbacks:
    ``parse`` (category links on the start page) ->
    ``parseSort`` (games listed on a category page) ->
    ``parseInfo`` (live-room links listed on a game page).
    ``parseHouse`` extracts room details, but no request is currently
    scheduled with it as the callback.
    """

    name = 'hy'
    allowed_domains = ['huya.com']
    start_urls = ['http://huya.com/g']

    # Pulls the game id out of the ``report`` attribute of a game link.
    # Compiled once for the class instead of once per response.
    _GAME_ID_RE = re.compile('"game_id":"(.*)"}')

    def parse(self, response):
        """Yield a ``sortItem`` per category link and follow each link.

        Args:
            response: the response for the start URL.

        Yields:
            A ``sortItem`` with ``Surl``/``Sname``, followed by a request
            for that URL handled by ``parseSort``.
        """
        category_links = "//div[@class='filter']/dl[1]/dd[position()=5]/a"
        urls = response.xpath(category_links + "/@href").extract()
        names = response.xpath(category_links + "/span/text()").extract()
        # zip() stops at the shorter list, so a link without a <span> label
        # cannot cause an IndexError.
        for url, name in zip(urls, names):
            # A fresh item per iteration: re-yielding one mutated instance
            # would make every yielded item alias the same object.
            yield sortItem(Surl=url, Sname=name)
            yield response.follow(url, self.parseSort)

    def parseSort(self, response):
        """Yield a ``gameInfo`` per game on a category page and follow it.

        Args:
            response: the response for a category page.

        Yields:
            A ``gameInfo`` item, followed by a request for the game's URL
            handled by ``parseInfo``.
        """
        game_entries = '//ul[@id="js-game-list"]/li'
        names = response.xpath(game_entries + '/@title').extract()
        urls = response.xpath(game_entries + '/a/@href').extract()
        images = response.xpath(game_entries + '/a/img/@src').extract()
        reports = response.xpath(game_entries + '/a/@report').extract()
        # Iterate over every entry; the previous ``range(len(...) - 1)``
        # silently skipped the last game on each page.
        for name, url, image, report in zip(names, urls, images, reports):
            yield gameInfo(
                gameName=name,
                gameImg=image,
                # findall() returns a list; kept as a list so exported
                # data keeps its existing shape (e.g. ["1713"]).
                gameGid=self._GAME_ID_RE.findall(report),
                gameUrl=url,
            )
            yield response.follow(url, self.parseInfo)

    # Live rooms
    def parseInfo(self, response):
        """Yield a ``gameSonSort`` per live-room link on a game page.

        Args:
            response: the response for a game page.

        Yields:
            ``gameSonSort`` items holding the link text (``lable``) and the
            link target (``labhouse``).
        """
        room_links = '//ul[@id="js-live-list"]/li/a[2]'
        labels = response.xpath(room_links + '/text()').extract()
        room_urls = response.xpath(room_links + '/@href').extract()
        # NOTE: room pages are intentionally not followed here; to crawl
        # them, also yield response.follow(room_url, self.parseHouse).
        for label, room_url in zip(labels, room_urls):
            yield gameSonSort(lable=label, labhouse=room_url)

    def parseHouse(self, response):
        """Yield a ``houseInfo`` for each room id/title pair on a room page.

        Not scheduled by any callback in this spider at the moment.

        Args:
            response: the response for a live-room page.

        Yields:
            ``houseInfo`` items with ``houseId`` and ``houseTitle``.
            ``houseHot`` is not extracted.
        """
        room_ids = response.xpath(
            '//div[@class="host-detail J_roomHdDetail"]'
            '/span[@class="host-rid"]/em/text()'
        ).extract()
        titles = response.xpath('//div[@class="host-title"]/h1/text()').extract()
        # The previous ``range(len(houseId) - 1)`` produced nothing for a
        # page with a single room id; iterate over every pair instead.
        for room_id, title in zip(room_ids, titles):
            yield houseInfo(houseId=room_id, houseTitle=title)

items.py:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

# Huya top-level category
class sortItem(scrapy.Item):
    """One top-level category link scraped from the directory page."""
    # Plain class attribute, not a scrapy.Field, so it is not part of the
    # item's data. Presumably a storage table name read by a pipeline that
    # is not shown here -- TODO confirm.
    table_name='sortItem'
    # href of the category link.
    Surl=scrapy.Field()
    # Display text of the category link.
    Sname=scrapy.Field()

# Information about each sub-category (game) within a category
class gameInfo(scrapy.Item):
    """One game entry scraped from a category page."""
    # Plain class attribute, not a scrapy.Field; presumably a storage table
    # name used by a pipeline that is not shown here -- TODO confirm.
    table_name='gameInfo'
    # Value of the list entry's ``title`` attribute.
    gameName=scrapy.Field()
    # href of the game's link.
    gameUrl=scrapy.Field()
    # ``src`` of the game's image.
    gameImg=scrapy.Field()
    # Game id(s) matched in the link's ``report`` attribute; the spider
    # stores the list returned by ``findall``.
    gameGid=scrapy.Field()

# Category labels of a sub-section
class gameSonSort(scrapy.Item):
    """One live-room link scraped from a game page."""
    # Plain class attribute, not a scrapy.Field; presumably a storage table
    # name used by a pipeline that is not shown here -- TODO confirm.
    table_name='gameSonSort'
    # Text of the link (the spelling "lable" is the exported key and must
    # stay as-is for existing consumers).
    lable=scrapy.Field()
    # href of the link.
    labhouse=scrapy.Field()

class houseInfo(scrapy.Item):
    """Details of a single live room, filled by ``HySpider.parseHouse``."""
    # Plain class attribute, not a scrapy.Field; presumably a storage table
    # name used by a pipeline that is not shown here -- TODO confirm.
    table_name='houseInfo'
    # Room id text taken from the room page header.
    houseId=scrapy.Field()
    # Declared but not populated: its extraction is commented out in the
    # spider.
    houseHot=scrapy.Field()
    # Room title text taken from the page's <h1>.
    houseTitle=scrapy.Field()

运行scrapy:

scrapy crawl hy -o item.json

部分运行结果:

{"gameName": "\u5929\u5929\u70ab\u6597", "gameImg": "https://huyaimg.msstatic.com/cdnimage/game/1713-MS.jpg", "gameGid": ["1713"], "gameUrl": "https://www.huya.com/g/1713"},
{"gameName": "\u5267\u672c\u6740\u624b\u6e38", "gameImg": "https://huyaimg.msstatic.com/cdnimage/game/100093-MS.jpg", "gameGid": ["100093"], "gameUrl": "https://www.huya.com/g/100093"},
{"gameName": "\u5267\u672c\u6740", "gameImg": "https://huyaimg.msstatic.com/cdnimage/game/4343-MS.jpg", "gameGid": ["4343"], "gameUrl": "https://www.huya.com/g/4343"},
{"gameName": "\u8f90\u5c04\uff1a\u907f\u96be\u6240Online", "gameImg": "https://huyaimg.msstatic.com/cdnimage/game/4165-MS.jpg", "gameGid": ["4165"], "gameUrl": "https://www.huya.com/g/4165"},
{"gameName": "SKY\u5149\u9047", "gameImg": "https://huyaimg.msstatic.com/cdnimage/game/3719-MS.jpg", "gameGid": ["3719"], "gameUrl": "https://www.huya.com/g/3719"},
{"gameName": "\u5b9d\u85cf\u4e16\u754c", "gameImg": "https://huyaimg.msstatic.com/cdnimage/game/2960-MS.jpg", "gameGid": ["2960"], "gameUrl": "https://www.huya.com/g/2960"},
{"gameName": "\u5251\u7075\uff1a\u9769\u547d", "gameImg": "https://huyaimg.msstatic.com/cdnimage/game/4545-MS.jpg", "gameGid": ["4545"], "gameUrl": "https://www.huya.com/g/4545"},
{"gameName": "\u6562\u8fbe\u4e89\u950b\u5bf9\u51b3", "gameImg": "https://huyaimg.msstatic.com/cdnimage/game/3083-MS.jpg", "gameGid": ["3083"], "gameUrl": "https://www.huya.com/g/3083"},
{"lable": "\u5929\u4ebf\u753b\u753b\u4e3b\u64ad \u62db\u4e3b\u64ad", "labhouse": "https://www.huya.com/xiaoyuankuku"},
{"lable": "\u9759\u9759\uff1a\u4eca\u5929\u51fa\u95e8\uff0c\u5e26\u4fe9\u4fdd\u9556", "labhouse": "https://www.huya.com/452455"},
{"lable": "\u53cc\u7cfb\u7edf\u67e5\u53f7\uff0c\u6218\u529b\u6307\u5bfc", "labhouse": "https://www.huya.com/19482934"},
{"lable": "\u4e0b\u5468\u4e00\u665a\u4e0a9\u70b9\u534a\u62bd\u9e21\u7fc5\u4f50\u52a9\u3001\u8d85\u5f71\u3001\u5f71", "labhouse": "https://www.huya.com/18176078"},
{"lable": "\u7c89\u4e1d\u724c\u5b50\u514d\u8d39\u8d85\u5f71,\u9a6c\u8f66\u4e00\u54e5\u5b89\u6392", "labhouse": "https://www.huya.com/17869248"},
{"lable": "\u7fbd\u7ffc\u5e26\u5168\u7cfb\u7edf\u4fee\u7f57\u56e2\u672c\uff0c\u63a5\u9a6c\u8f66\u62a4\u9001\u3002", "labhouse": "https://www.huya.com/yuyituanben"},
{"lable": "\u5361\u724c\u5b50\u52a0\u7fa4\u76f4\u63a5\u4fee\u7f57 \u516b\u7ea7\u4ee5\u4e0a\u7c89\u4e1d\u724c\u9001\u5f71", "labhouse": "https://www.huya.com/20075456"},
{"lable": "V0\u7684\u5347\u6218\u8def", "labhouse": "https://www.huya.com/17668300"},
{"lable": "\u5468\u672b\u5feb\u4e50\u5fcd\u8005", "labhouse": "https://www.huya.com/18349833"},
{"lable": "\u8ba9\u8001\u673d\u597d\u597d\u75bc\u7231\u4f60\u4eec\u5427", "labhouse": "https://www.huya.com/817490"},
  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值