scrapy安装
打开一个终端输入
python -m pip install --upgrade pip
pip install wheel
pip install lxml
pip install twisted
pip install pywin32
pip install scrapy
创建项目
打开一个终端输入
cd desktop
scrapy startproject 项目名字
cd 项目名字
scrapy genspider txms 项目网站
修改setting
# --- Scrapy settings overrides for the TXmovies project ---
# Ignore robots.txt rules when crawling.
ROBOTSTXT_OBEY = False
# Wait 1 second between consecutive requests to be polite to the server.
DOWNLOAD_DELAY = 1
# Default headers sent with every request.
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
# NOTE(review): this is an old Chrome 27 UA string — consider updating it
# if the target site starts rejecting requests.
'User-Agent':'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'
}
# Enable the project's item pipeline; the value (0-1000) is its run priority.
ITEM_PIPELINES = {
'TXmovies.pipelines.TxmoviesPipeline': 300,
}
写爬虫程序
编辑 spiders 目录下由 genspider 生成的爬虫文件(txms.py),并将 name 改为 'xpath'
import scrapy
# 回退一层文件夹
from..items import TxmoviesItem
class TxmsSpider(scrapy.Spider):
    """Crawl Tencent Video's cartoon listing pages and yield one item per movie.

    Follows the paginated listing endpoint in steps of 30 (offset 0..120),
    extracting each movie's title and description.
    """

    name = 'xpath'
    # Bug fix: Scrapy expects `allowed_domains` (plural). The original
    # `allowed_domain` attribute is silently ignored, so the off-site
    # request filter was never active.
    allowed_domains = ['v.qq.com']
    # First listing page (offset=0).
    start_urls = ['https://v.qq.com/x/bu/pagesheet/list?append=1&channel=cartoon&iarea=1&listpage=2&offset=0&pagesize=30']
    # Paging offset, advanced by 30 after each page.
    offset = 0

    def parse(self, response):
        # Each movie is one <div class="list_item"> on the page.
        for node in response.xpath('//div[@class="list_item"]'):
            # Bug fix: the original created ONE TxmoviesItem before the loop
            # and mutated it on every iteration. Because yielded items may be
            # consumed later (buffered by exporters/pipelines), every row could
            # end up showing the last movie's data. Build a fresh item per row.
            item = TxmoviesItem()
            item['name'] = node.xpath('./a/@title').get()
            item['description'] = node.xpath('./div/div/@title').get()
            # Hand the item over to the pipeline.
            yield item
        # Request the next page until the offset reaches 120.
        if self.offset < 120:
            self.offset += 30
            url = ('https://v.qq.com/x/bu/pagesheet/list?append=1&channel=cartoon'
                   '&iarea=1&listpage=2&offset={}&pagesize=30').format(self.offset)
            # Schedule the next page, re-entering parse() as the callback.
            yield scrapy.Request(url=url, callback=self.parse)
使用管道输出
使用pipelines.py文件
from itemadapter import ItemAdapter
class TxmoviesPipeline:
    """Item pipeline that echoes each scraped item to stdout unchanged."""

    def process_item(self, item, spider):
        # Print for visibility during development, then forward the item
        # untouched to any later pipeline stage.
        print(item)
        return item
run
创建run.py文件运行
from scrapy import cmdline

# Launch the 'xpath' spider, exactly as running `scrapy crawl xpath`
# from a terminal would.
cmdline.execute(['scrapy', 'crawl', 'xpath'])