主程序hy.py:
# -*- coding: utf-8 -*-
import scrapy
from ..items import sortItem,gameInfo,gameSonSort,houseInfo
from scrapy import Request
import re
from time import sleep
class HySpider(scrapy.Spider):
    """Crawl huya.com section, game and live-room listings.

    Callback chain:

    * ``parse`` reads the section links on the start page, emits one
      ``sortItem`` per link and follows it to ``parseSort``.
    * ``parseSort`` emits one ``gameInfo`` per game card and follows the
      game link to ``parseInfo``.
    * ``parseInfo`` emits one ``gameSonSort`` per live-room entry.
    * ``parseHouse`` extracts room details but is not scheduled by any
      callback in this class (see the note in ``parseInfo``).
    """

    name = 'hy'
    allowed_domains = ['huya.com']
    start_urls = ['http://huya.com/g']

    # Extracts the game id from the ``report`` attribute of a game card.
    # Compiled once here instead of on every ``parseSort`` call.
    _GAME_ID_RE = re.compile(r'"game_id":"(.*)"}')

    def parse(self, response):
        """Yield a ``sortItem`` and a follow-up request per section link.

        The links are taken from the fifth ``<dd>`` of the first ``<dl>``
        inside ``div.filter``.
        """
        urls = response.xpath(
            "//div[@class='filter']/dl[1]/dd[position()=5]/a/@href"
        ).extract()
        names = response.xpath(
            "//div[@class='filter']/dl[1]/dd[position()=5]/a/span/text()"
        ).extract()
        # zip() stops at the shorter list, so a link without a <span> label
        # can no longer raise IndexError.
        for url, sort_name in zip(urls, names):
            # A fresh item per link: re-yielding one mutated instance would
            # make every yielded item alias the same object.
            yield sortItem(Surl=url, Sname=sort_name)
            yield response.follow(url, self.parseSort)

    def parseSort(self, response):
        """Yield a ``gameInfo`` and a follow-up request per game card."""
        game_names = response.xpath(
            '//ul[@id="js-game-list"]/li/@title'
        ).extract()
        game_urls = response.xpath(
            '//ul[@id="js-game-list"]/li/a/@href'
        ).extract()
        game_imgs = response.xpath(
            '//ul[@id="js-game-list"]/li/a/img/@src'
        ).extract()
        reports = response.xpath(
            '//ul[@id="js-game-list"]/li/a/@report'
        ).extract()
        # Iterate over every card; the previous ``range(len(...) - 1)`` loop
        # skipped the last one.
        for game_name, game_url, game_img, report in zip(
            game_names, game_urls, game_imgs, reports
        ):
            yield gameInfo(
                gameName=game_name,
                gameImg=game_img,
                # Kept as the list returned by findall() so the exported
                # shape (e.g. ["1713"]) stays the same.
                gameGid=self._GAME_ID_RE.findall(report),
                gameUrl=game_url,
            )
            yield response.follow(game_url, self.parseInfo)

    # Live rooms
    def parseInfo(self, response):
        """Yield a ``gameSonSort`` per live-room entry on a game page.

        Both the text and the ``href`` come from the second ``<a>`` of each
        ``<li>`` under ``ul#js-live-list``.
        """
        labels = response.xpath(
            '//ul[@id="js-live-list"]/li/a[2]/text()'
        ).extract()
        house_urls = response.xpath(
            '//ul[@id="js-live-list"]/li/a[2]/@href'
        ).extract()
        # Includes the last entry, which the old ``len(...) - 1`` bound
        # dropped.
        for label, house_url in zip(labels, house_urls):
            # NOTE: following each room page is intentionally left disabled,
            # as in the original. To enable it, also yield
            # ``response.follow(house_url, self.parseHouse)`` here.
            yield gameSonSort(lable=label, labhouse=house_url)

    def parseHouse(self, response):
        """Yield a ``houseInfo`` per room id / title pair on a room page.

        ``houseHot`` is declared on the item but is not populated here.
        """
        house_ids = response.xpath(
            '//div[@class="host-detail J_roomHdDetail"]'
            '/span[@class="host-rid"]/em/text()'
        ).extract()
        house_titles = response.xpath(
            '//div[@class="host-title"]/h1/text()'
        ).extract()
        # With the old ``len(...) - 1`` bound a page exposing a single room
        # id produced no item at all.
        for house_id, house_title in zip(house_ids, house_titles):
            yield houseInfo(houseId=house_id, houseTitle=house_title)
items.py:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
# Huya top-level section
class sortItem(scrapy.Item):
    """Item describing one top-level section link."""

    # Plain class attribute, not a scrapy Field.
    # NOTE(review): presumably read by a pipeline to pick a storage table;
    # no consumer is visible in this file - confirm.
    table_name = 'sortItem'

    Surl = scrapy.Field()   # section link
    Sname = scrapy.Field()  # section display name
# Information about each sub-category (game) within a section
class gameInfo(scrapy.Item):
    """Item describing one game card."""

    # Plain class attribute, not a scrapy Field.
    # NOTE(review): presumably read by a pipeline to pick a storage table;
    # no consumer is visible in this file - confirm.
    table_name = 'gameInfo'

    gameName = scrapy.Field()  # game title
    gameUrl = scrapy.Field()   # link to the game's listing page
    gameImg = scrapy.Field()   # cover image URL
    gameGid = scrapy.Field()   # game id
# Category labels of a sub-section
class gameSonSort(scrapy.Item):
    """Item pairing a label with a live-room link."""

    # Plain class attribute, not a scrapy Field.
    # NOTE(review): presumably read by a pipeline to pick a storage table;
    # no consumer is visible in this file - confirm.
    table_name = 'gameSonSort'

    # The spelling "lable" is the runtime field name and is kept on purpose.
    lable = scrapy.Field()     # label text
    labhouse = scrapy.Field()  # live-room link
class houseInfo(scrapy.Item):
    """Item describing a single live room."""

    # Plain class attribute, not a scrapy Field.
    # NOTE(review): presumably read by a pipeline to pick a storage table;
    # no consumer is visible in this file - confirm.
    table_name = 'houseInfo'

    houseId = scrapy.Field()     # room id
    houseHot = scrapy.Field()    # NOTE(review): looks like a popularity figure - confirm
    houseTitle = scrapy.Field()  # room title
运行scrapy:
scrapy crawl hy -o item.json
部分运行结果:
{"gameName": "\u5929\u5929\u70ab\u6597", "gameImg": "https://huyaimg.msstatic.com/cdnimage/game/1713-MS.jpg", "gameGid": ["1713"], "gameUrl": "https://www.huya.com/g/1713"},
{"gameName": "\u5267\u672c\u6740\u624b\u6e38", "gameImg": "https://huyaimg.msstatic.com/cdnimage/game/100093-MS.jpg", "gameGid": ["100093"], "gameUrl": "https://www.huya.com/g/100093"},
{"gameName": "\u5267\u672c\u6740", "gameImg": "https://huyaimg.msstatic.com/cdnimage/game/4343-MS.jpg", "gameGid": ["4343"], "gameUrl": "https://www.huya.com/g/4343"},
{"gameName": "\u8f90\u5c04\uff1a\u907f\u96be\u6240Online", "gameImg": "https://huyaimg.msstatic.com/cdnimage/game/4165-MS.jpg", "gameGid": ["4165"], "gameUrl": "https://www.huya.com/g/4165"},
{"gameName": "SKY\u5149\u9047", "gameImg": "https://huyaimg.msstatic.com/cdnimage/game/3719-MS.jpg", "gameGid": ["3719"], "gameUrl": "https://www.huya.com/g/3719"},
{"gameName": "\u5b9d\u85cf\u4e16\u754c", "gameImg": "https://huyaimg.msstatic.com/cdnimage/game/2960-MS.jpg", "gameGid": ["2960"], "gameUrl": "https://www.huya.com/g/2960"},
{"gameName": "\u5251\u7075\uff1a\u9769\u547d", "gameImg": "https://huyaimg.msstatic.com/cdnimage/game/4545-MS.jpg", "gameGid": ["4545"], "gameUrl": "https://www.huya.com/g/4545"},
{"gameName": "\u6562\u8fbe\u4e89\u950b\u5bf9\u51b3", "gameImg": "https://huyaimg.msstatic.com/cdnimage/game/3083-MS.jpg", "gameGid": ["3083"], "gameUrl": "https://www.huya.com/g/3083"},
{"lable": "\u5929\u4ebf\u753b\u753b\u4e3b\u64ad \u62db\u4e3b\u64ad", "labhouse": "https://www.huya.com/xiaoyuankuku"},
{"lable": "\u9759\u9759\uff1a\u4eca\u5929\u51fa\u95e8\uff0c\u5e26\u4fe9\u4fdd\u9556", "labhouse": "https://www.huya.com/452455"},
{"lable": "\u53cc\u7cfb\u7edf\u67e5\u53f7\uff0c\u6218\u529b\u6307\u5bfc", "labhouse": "https://www.huya.com/19482934"},
{"lable": "\u4e0b\u5468\u4e00\u665a\u4e0a9\u70b9\u534a\u62bd\u9e21\u7fc5\u4f50\u52a9\u3001\u8d85\u5f71\u3001\u5f71", "labhouse": "https://www.huya.com/18176078"},
{"lable": "\u7c89\u4e1d\u724c\u5b50\u514d\u8d39\u8d85\u5f71,\u9a6c\u8f66\u4e00\u54e5\u5b89\u6392", "labhouse": "https://www.huya.com/17869248"},
{"lable": "\u7fbd\u7ffc\u5e26\u5168\u7cfb\u7edf\u4fee\u7f57\u56e2\u672c\uff0c\u63a5\u9a6c\u8f66\u62a4\u9001\u3002", "labhouse": "https://www.huya.com/yuyituanben"},
{"lable": "\u5361\u724c\u5b50\u52a0\u7fa4\u76f4\u63a5\u4fee\u7f57 \u516b\u7ea7\u4ee5\u4e0a\u7c89\u4e1d\u724c\u9001\u5f71", "labhouse": "https://www.huya.com/20075456"},
{"lable": "V0\u7684\u5347\u6218\u8def", "labhouse": "https://www.huya.com/17668300"},
{"lable": "\u5468\u672b\u5feb\u4e50\u5fcd\u8005", "labhouse": "https://www.huya.com/18349833"},
{"lable": "\u8ba9\u8001\u673d\u597d\u597d\u75bc\u7231\u4f60\u4eec\u5427", "labhouse": "https://www.huya.com/817490"},