scrapy的中文文档
https://scrapy-chs.readthedocs.io/zh_CN/0.24/
在windows下必须安装pypiwin32
pip install scrapy
pip install pypiwin32
在win10上报错需要安装 Twisted-20.3.0-cp38-cp38-win32.whl
报错的部分内容如下:
ERROR: Command errored out with exit status 1:
command: ‘c:\users\15870\appdata\local\programs\python\python37-32\python.exe’ -u -c ‘import sys, setuptools, tokenize; sys.argv[0] = ‘"’"‘C:\Users\15870\AppData\Local\Temp\pip-install-2wcyweho\wordcloud\setup.py’"’"’; file=’"’"‘C:\Users\15870\AppData\Local\Temp\pip-install-2wcyweho\wordcloud\setup.py’"’"’;f=getattr(tokenize, ‘"’"‘open’"’"’, open)(file);code=f.read().replace(’"’"’\r\n’"’"’, ‘"’"’\n’"’"’);f.close();exec(compile(code, file, ‘"’"‘exec’"’"’))’ install --record ‘C:\Users\15870\AppData\Local\Temp\pip-record-9qx3thr5\install-record.txt’ --single-version-externally-managed --compile
Twisted-20.3.0-cp38-cp38-win32.whl 下载地址https://www.lfd.uci.edu/~gohlke/pythonlibs/
安装
pip install ./Twisted-20.3.0-cp38-cp38-win32.whl
在ubuntu上安装scrapy之前需要先安装依赖
sudo apt-get install python-dev python-pip libxml2-dev libxslt1-dev zlib1g-dev libffi-dev libssl-dev
然后再安装 scrapy
pip install scrapy
创建项目
要使用scrapy框架创建项目
scrapy startproject [项目名称]
使用命令创建一个爬虫
scrapy genspider [爬虫名] [域名]
运行项目
scrapy crawl [爬虫名称]
爬取内容
# -*- coding: utf-8 -*-
import scrapy
import bdbk.items
from copy import deepcopy #深拷贝
import re
class QcwxjsSpider(scrapy.Spider):
    """Crawl the Autohome tech-channel list pages and follow each article's
    single-page ("-all.html") version, yielding one BdbkItem per article."""

    name = 'qcwxjs'
    allowed_domains = ['www.qcwxjs.com']
    start_urls = ['https://www.autohome.com.cn/tech/3/#liststart']

    def parse(self, response):
        # Debug aid kept from the original: dump inline <script> bodies.
        scripts = re.findall('<script type="text/javascript">(.*?)</script>',
                             response.body.decode(response.encoding))
        print(scripts)

        article_nodes = response.xpath(
            "//div[@id='auto-channel-lazyload-article']/ul/li")
        for li in article_nodes:
            href = li.xpath(".//a/@href").extract_first()
            if not href:
                continue  # list item without a link -- original crashed here
            # Rewrite /xxx.html#frag into the single-page /xxx-all.html#frag
            # URL so the whole article is fetched in one request.
            head, _, frag = href.partition('.html')
            detail_url = "https:" + head + '-all.html' + frag
            yield scrapy.Request(detail_url, callback=self.getInfo,
                                 dont_filter=True)

        # Follow the "next page" link. extract_first() returns None when the
        # link is absent (last page), so the guard below actually fires; the
        # original concatenated first and then tested the result for None,
        # which could never be None and raised IndexError on the last page.
        next_href = response.xpath(
            '//div[@id="channelPage"]//a[contains(text(),"下一页")]/@href'
        ).extract_first()
        if next_href is not None:
            yield scrapy.Request("https://www.autohome.com.cn/" + next_href,
                                 callback=self.parse, dont_filter=True)

    def getInfo(self, response):
        """Parse one article detail page into a BdbkItem."""
        item = bdbk.items.BdbkItem()
        item['title'] = response.xpath(
            "//div[@class='container article']//h1/text()").extract()
        item['yyr'] = response.xpath(
            "//div[@class='container article']//span[@class='time']/text()"
        ).extract_first()
        item['action'] = response.xpath(
            "//div[@class='container article']//div[@id='articleContent']//text()"
        ).extract()
        item['imgUrl'] = response.xpath(
            "//div[@class='container article']//div[@id='articleContent']//img/@src"
        ).extract()
        yield item
在settings.py中设置
LOG_LEVEL = "WARNING"  # only emit warnings and above
# LOG_FILE = "./log.txt"  # uncomment to also write the log to a local file

# NOTE: the original line ended with a stray comma, which made USER_AGENT a
# one-element tuple instead of a string and broke the header Scrapy sends.
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
在pipelines.py中处理爬取到的数据
import re
class BdbkPipeline(object):
    """Item pipeline: strips all whitespace from the scraped fields and
    flattens the text-fragment lists into single strings."""

    def open_spider(self, spider):
        # Runs exactly once when the spider starts -- a good place to open
        # a database connection.
        print('爬虫开始执行')

    def close_spider(self, spider):
        # Runs exactly once when the spider closes.
        print('爬虫结束后执行')

    def process_item(self, item, spider):
        item['title'] = self.process_content(item['title'])
        item['yyr'] = ''.join(self.process_content(item['yyr']))
        item['action'] = ''.join(self.process_content(item['action']))
        print(item)
        return item

    def process_content(self, content):
        """Remove every whitespace character from each element of *content*.

        Returns [] when content is None (e.g. extract_first() matched
        nothing), which the original would have crashed on.
        """
        if content is None:
            return []
        # \s already matches \r and \n, so the original "\r|\n|\s" pattern
        # reduces to just \s.
        return [re.sub(r"\s", "", part) for part in content]
在items.py中定义需要爬取到的数据
import scrapy
class BdbkItem(scrapy.Item):
    """Container for one scraped Autohome article."""

    title = scrapy.Field()   # headline text nodes
    yyr = scrapy.Field()     # publication-time string
    action = scrapy.Field()  # article body text fragments
    imgUrl = scrapy.Field()  # image URLs found in the article body
Scrapy中CrawlSpider的使用
生成crawlspider的命令:
scrapy genspider -t crawl [爬虫名] [爬取的域名]
例如:
scrapy genspider -t crawl qczj autohome.com.cn
LinkExtractor: 链接提取器。
主要参数:
1.allow:满足括号中“正则表达式”的值会被提取,如果为空,则会全部匹配。
2.deny:与这个正则表达式(或正则表达式列表)相匹配的URL一定不提取。
3.allow_domains:会被提取的链接domains。
4.deny_domains:一定不会被提取链接的domains
5.restrict_xpaths:使用xpath表达式,和allow共同作用过滤链接
LinkExtractor(
allow=r'Items/',# 满足括号中“正则表达式”的值会被提取,如果为空,则全部匹配。
deny=xxx, # 满足正则表达式的则不会被提取。
restrict_xpaths=xxx, # 满足xpath表达式的值会被提取
restrict_css=xxx, # 满足css表达式的值会被提取
deny_domains=xxx, # 不会被提取的链接的domains。
)
rules
在rules中包含一个或多个Rule对象
每个Rule对爬取网站的动作定义了特定的操作。
如果多个rule匹配了相同的链接,则根据规则在本集合中被定义的顺序,第一个会被使用
rules参数的介绍
link_extractors:是一个LinkExtractor对象,用于定义需要提取的链接
callback:从link_extractor中每获取到链接时,参数所指定的值作为回调函数,该回调函数接受一个response作为其第一个参数
注意:当编写爬虫规则时,避免使用parse作为回调函数。由于CrawlSpider使用parse方法来实现其逻辑,如果覆盖了parse方法,CrawlSpider将会运行失败
follow:是一个布尔值(boolean),指定了根据该规则从response提取的链接是否需要跟进。如果callback为None,follow默认设置为True,否则默认为False
process_links:指定该Spider中哪个函数将会被调用,从link_extractor中获取到链接列表时将会调用该函数。该方法主要用来过滤链接
process_request:指定该Spider中哪个函数将会被调用,该规则提取到每个request时都会调用该函数。(用来过滤request)
使用 CrawlSpider
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class QczjSpider(CrawlSpider):
    """CrawlSpider demo: rule-driven crawl of the Autohome tech channel."""

    name = 'qczj'
    allowed_domains = ['www.autohome.com.cn']
    start_urls = ['https://www.autohome.com.cn/tech/1/#liststart']

    # Each Rule pairs a LinkExtractor regex with an optional callback;
    # follow=True keeps walking the pagination links.
    rules = (
        # Article detail pages -> parse_item
        Rule(LinkExtractor(allow=r'/tech/202004/\d+\.html#pvareaid=102624'),
             callback='parse_item'),
        # List pagination -> no callback, just follow
        Rule(LinkExtractor(allow=r'/tech/\d+\/#liststart'), follow=True),
    )

    def parse_item(self, response):
        headline = response.xpath(
            "//div[@id='articlewrap']/h1/text()").extract_first()
        print(headline)
scrapy中使用cookies
爬139邮箱
获取cookie
# -*- coding: utf-8 -*-
import scrapy
import re
class QqemalSpider(scrapy.Spider):
    """Access 139 mail by replaying a Cookie header captured from a
    logged-in browser session."""

    name = 'qqemal'
    allowed_domains = ['appmail.mail.10086.cn']
    start_urls = ['https://appmail.mail.10086.cn/m6/html/index.html?sid=00U4Njc0MTQwNjAwMTkyMTAy000C58C3000004&rnd=612&tab=&comefrom=54&v=&k=9667&cguid=0930000138494&mtime=56&h=1']

    # Override start_requests so the first request already carries cookies.
    def start_requests(self):
        # Raw Cookie header copied from the browser's dev tools.
        raw_cookies = "_139_index_isSimLogin=0; UUIDToken=4afc50aa-6607-446a-b345-5431eee12a19; _139_index_login=15867411864850926030579662; _139_index_isSmsLogin=1; pwdKey=46b91fdbdfecd0a97b6aec8eb0c31de9; sid=sid9107791ca56aa7c95f266dc9f60619dd; umckey=c743ccb52fd8bf0eb7d9d7ad7e5b23a4; PICTUREUIN=z3TxlBa82BTORKtplViHiw==; PICTURELOGIN=NGMyYTY5YWQ0MzM5OTNjMDQ3NDg4NjQ0ZDNmYTNiYnwxMzYwNjk2Njd8MTU4Njc0MTQwNjMxOHxSSUNISU5GTzg4OA==; agentid=311e3dc3-0aec-4d40-a4ba-67cf66fbe8b8; RMKEY=b0553db038fc1fe3; Os_SSo_Sid=00U4Njc0MTQwNjAwMTkyMTAy000C58C3000004; cookiepartid9667=12; ut9667=2; cookiepartid=12; Login_UserNumber=15178866572; UserData={}; SkinPath29667=; rmUin9667=208808947; provCode9667=31; areaCode9667=3102; _139_login_version=60; welcome=s%3ACbkjj8o1dNMHGtBEGuBZogkW0ZaVfMXL.nSZ7FTOywqyQLRdjv6xXrEl9UKVvvbshedhVKDBkppY; loginProcessFlag="
        # Split on the FIRST '=' only: several cookie values (base64 blobs
        # such as PICTUREUIN ending in '==') contain '=' themselves, and the
        # original i.split("=")[1] truncated them at the first '='.
        cookies = {pair.split("=", 1)[0]: pair.split("=", 1)[1]
                   for pair in raw_cookies.split("; ")}
        yield scrapy.Request(
            self.start_urls[0],
            callback=self.parse,
            cookies=cookies,
        )

    def parse(self, response):
        body = response.body.decode(response.encoding)
        print(body)
        # Crude logged-in check: the nav text "首页" (home) only appears
        # when the session is valid.
        print(re.findall('首页', body))
下载中间件
配置多个user-agent
# Pool of User-Agent strings for the rotation middleware.
USER_AGENT_LIST = [
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",  # QQ Browser
    # NOTE: the original list was missing the comma after this entry, which
    # silently concatenated it with the next string into one malformed UA.
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",  # 360
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",  # Maxthon
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",  # IE 8.0
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",  # IE 9.0 (unbalanced paren kept as in source)
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",  # IE 11
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",
]
在Middleware设置User-Agent
def process_request(self, request, spider):
    """Downloader-middleware hook: attach a randomly chosen User-Agent
    (and optionally a proxy) to every outgoing request."""
    proxy = random.choice(spider.settings.get('PROXIES'))
    # Uncomment to route the request through the chosen proxy.
    # request.meta["proxy"] = proxy
    chosen_ua = random.choice(spider.settings.get('USER_AGENT_LIST'))
    request.headers["User-Agent"] = chosen_ua
    return None  # let the downloader / next middleware continue
def process_response(self, request, response, spider):
    """Downloader-middleware hook: log the User-Agent that was actually
    sent with *request*, then hand the response on unchanged."""
    sent_ua = request.headers["User-Agent"]
    print(sent_ua)
    return response
scrapy发送POST请求
import scrapy
class GiteeSpider(scrapy.Spider):
    """Scripted form login demo: scrape the hidden form fields from the
    login page, then POST them (plus credentials) as form data."""

    name = 'gitee'
    allowed_domains = ['gitee.com']
    start_urls = ['https://gitee.com/login']

    def parse(self, response):
        # Debug: dump the page body.
        print(response.xpath("//body").extract())

        def hidden(field_name):
            # Value of a hidden <input> inside the login form.
            return response.xpath(
                "//form//input[@name='%s']/@value" % field_name
            ).extract_first()

        # NOTE(review): credentials are hard-coded below -- move them to
        # settings or environment variables before sharing this code.
        post_data = {
            "commit": hidden('commit'),
            "authenticity_token": hidden('authenticity_token'),
            "ga_id": hidden('ga_id'),
            "login": "342730241@qq.com",
            "password": "yubo@0128",
            "webauthn-support": hidden('webauthn-support'),
            "webauthn-iuvpaa-support": hidden('webauthn-iuvpaa-support'),
            "return_to": hidden('return_to'),
            "required_field_9ef7": "",
            "timestamp": hidden('timestamp'),
            "timestamp_secret": hidden('timestamp_secret'),
        }
        yield scrapy.FormRequest(
            "https://github.com/session",
            formdata=post_data,
            callback=self.login,
        )

    def login(self, response):
        print(response)