1. Create the project and spider, and set the allowed domains
scrapy startproject lelive
cd lelive
scrapy genspider letvlive letv.com
# scrapy genspider letvlive www.letv.com
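For reference, genspider writes a skeleton spider to lelive/spiders/letvlive.py, roughly like this (the exact template varies by Scrapy version):

import scrapy

class LetvliveSpider(scrapy.Spider):
    name = 'letvlive'
    allowed_domains = ['letv.com']
    start_urls = ['http://letv.com/']

    def parse(self, response):
        pass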
main.py, a launcher module so the spider can be started from the IDE:
from scrapy import cmdline
cmdline.execute('scrapy crawl letvlive -o json.json'.split())
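Running main.py is equivalent to typing the same command in a terminal at the project root. The -o flag writes items through Scrapy's feed export, separately from the pipelines written later; note that -o appends to an existing file, while Scrapy 2.0+ also offers a capital -O that overwrites:

scrapy crawl letvlive -O json.json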
items.py, the target fields to scrape:
import scrapy

class LeliveItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    nick = scrapy.Field()      # streamer nickname
    image = scrapy.Field()     # screenshot URL
    liveUrl = scrapy.Field()   # live stream URL
    path = scrapy.Field()      # local path of the downloaded image
First verify that the spider can actually reach the target with a quick print(response.url); only once that request goes through should you write the real parse logic.
The target here is the following link, captured by sniffing the mobile app:
2. Capture packets to get the LeTV live channel-list link
# -*- coding: utf-8 -*-
import scrapy
from lelive.items import LeliveItem

class LetvliveSpider(scrapy.Spider):
    name = 'letvlive'
    # allowed_domains = ['www.letv.com']  # yields fewer items: the API host
    # dynamic.live.app.m.letv.com is a subdomain of letv.com but not of
    # www.letv.com, so OffsiteMiddleware drops the follow-up requests
    allowed_domains = ['letv.com']
    # start_urls = ['http://www.letv.com/']
    page = 1
    pre_url = 'http://dynamic.live.app.m.letv.com/android/dynamic.php?luamod=main&mod=live&ctl=liveHuya&act=channelList&pcode=010210000&version=8.1&channelId=2168&pages='
    after_url = '&country=CN&provinceid=1&districtid=9&citylevel=1&location=%E5%8C%97%E4%BA%AC%E5%B8%82%7C%E6%9C%9D%E9%98%B3%E5%8C%BA&lang=chs&region=CN'
    start_urls = [pre_url + str(page) + after_url]

    def parse(self, response):
        print('=========', response.url)
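An equivalent way to issue the first request (a sketch, not from the original) is to override start_requests instead of assembling start_urls at class level:

    def start_requests(self):
        url = self.pre_url + str(self.page) + self.after_url
        yield scrapy.Request(url, callback=self.parse)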
Write the parse function for the first page and pull out its data:
import json

class LetvliveSpider(scrapy.Spider):
    start_urls = [pre_url + str(page) + after_url]

    def parse(self, response):
        print('======---------------------------===', response.url)
        # the body is JSON, so use response.text; json.loads gives a dict, so
        # take the list of rooms directly with result_test['body']['result']
        result_test = json.loads(response.text)  # json.loads no longer accepts encoding= in Python 3.9+
        for data in result_test['body']['result']:
            # print(data)
            nick = data['nick']
            image = data['screenshot']
            liveUrl = data['liveUrl']
            # print(liveUrl)
            item = LeliveItem()
            item['nick'] = nick
            item['image'] = image
            item['liveUrl'] = liveUrl
            yield item
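For reference, the parse function above assumes the API responds with JSON shaped roughly like this (values illustrative, trimmed to the keys actually used):

{
    "body": {
        "result": [
            {"nick": "...", "screenshot": "...", "liveUrl": "..."},
            ...
        ]
    }
}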
pipelines.py, saving the first page of items to a file:
import json

class LeliveTextSavePipeline(object):
    def open_spider(self, spider):
        self.file = open('letv.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # one JSON object per line (JSON Lines)
        dict_str = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(dict_str)
        return item

    def close_spider(self, spider):
        self.file.close()
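The same effect can be had with Scrapy's built-in exporter; a minimal sketch, assuming the default JsonLinesItemExporter behaviour is acceptable:

from scrapy.exporters import JsonLinesItemExporter

class LeliveTextSavePipeline(object):
    def open_spider(self, spider):
        self.file = open('letv.json', 'wb')  # exporters expect a binary file
        self.exporter = JsonLinesItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()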
settings.py, registering the pipeline:
ITEM_PIPELINES = {
    'lelive.pipelines.LeliveTextSavePipeline': 301,
}
Run it and check the output: OK~. Then handle the many pages that follow:
class LetvliveSpider(scrapy.Spider):
    .........
    def parse(self, response):
        print('=========', response.url)
        .......
            yield item
        # build the next page's URL and yield a follow-up request
        if self.page < 100:
            self.page += 1
            new_url = self.pre_url + str(self.page) + self.after_url
            yield scrapy.Request(new_url, callback=self.parse)
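Hard-coding 100 pages works but is brittle. A sketch of a data-driven stop condition, assuming the API returns an empty result list past the last page:

        # inside parse(), after the for loop over result_test['body']['result']
        if result_test['body']['result'] and self.page < 100:  # stop early once a page comes back empty
            self.page += 1
            new_url = self.pre_url + str(self.page) + self.after_url
            yield scrapy.Request(new_url, callback=self.parse)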
Print test: OK.
Write a LeliveImagePipeline in pipelines.py to save the images: subclass ImagesPipeline and override the get_media_requests method.
# subclass ImagesPipeline and override get_media_requests
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class LeliveImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # one download request per screenshot URL; ImagesPipeline fetches it
        image = item['image']
        yield scrapy.Request(image)
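Note that ImagesPipeline depends on Pillow for image processing; the pipeline will not run without it:

pip install Pillow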
Configure LeliveImagePipeline in settings.py and set the IMAGES_STORE path. It gets priority 300, lower than the text pipeline's 301, and lower values run first, so item['path'] is already filled in by the time LeliveTextSavePipeline writes the item to letv.json.
ITEM_PIPELINES = {
    'lelive.pipelines.LeliveTextSavePipeline': 301,
    'lelive.pipelines.LeliveImagePipeline': 300,
}

import os
IMAGES_STORE = os.path.dirname(os.path.realpath(__file__)) + '/images/'
Rename the images: override ImagesPipeline's item_completed method and call os.rename(old_name, new_name). By default ImagesPipeline saves each file under IMAGES_STORE/full/ with the SHA1 hash of its URL as the filename, hence the rename.
import os
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from lelive.settings import IMAGES_STORE  # defined in settings.py above

class LeliveImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        image = item['image']
        yield scrapy.Request(image)

    def item_completed(self, results, item, info):
        # if isinstance(item, dict) or self.images_result_field in item.fields:
        # results is a list of (success, info_dict) tuples; keep the stored
        # paths of the downloads that succeeded
        image_name = [x['path'] for ok, x in results if ok][0]
        old_name = IMAGES_STORE + '/' + image_name
        new_name = IMAGES_STORE + '/' + item['nick'] + '.jpg'
        os.rename(old_name, new_name)
        item['path'] = new_name
        return item
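Two caveats with the rename: if every download failed, results has no successful entry and the [0] raises IndexError; and a nick containing / or other special characters breaks os.rename. A defensive sketch (the re.sub pattern is an assumption, not from the original; add import re at the top of pipelines.py):

    def item_completed(self, results, item, info):
        paths = [x['path'] for ok, x in results if ok]
        if not paths:
            return item  # download failed: keep the item, skip the rename
        safe_nick = re.sub(r'[\\/:*?"<>|]', '_', item['nick'])  # strip filesystem-unsafe characters
        new_name = IMAGES_STORE + '/' + safe_nick + '.jpg'
        os.rename(IMAGES_STORE + '/' + paths[0], new_name)
        item['path'] = new_name
        return item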