scrapy爬虫
爬取单个页面
douban.py
import scrapy
from scrapy import Selector
from spiderTest.items import MovieItem
class DoubanSpider(scrapy.Spider):
    """Spider that scrapes the first page of Douban Movie Top 250."""

    name = "douban"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/top250"]

    def parse(self, response):
        # Each <li> under the ordered list is one movie entry (25 per page);
        # check the page structure with the browser dev tools (F12).
        selector = Selector(response)
        for entry in selector.css('#content > div > div.article > ol > li'):
            item = MovieItem()
            # Keys must match the Field names declared on MovieItem.
            # .css() returns selector objects; extract_first() pulls the string.
            item['title'] = entry.css('span.title::text').extract_first()
            item['rank'] = entry.css('span.rating_num::text').extract_first()
            item['subject'] = entry.css('span.inq::text').extract_first()
            # Items are produced lazily via a generator, not returned.
            yield item
#运行爬虫文件并保存,默认支持的格式是.csv, .xml, .json
items.py
import scrapy
#组装数据为item对象
class MovieItem(scrapy.Item):
    """Container for one movie's scraped data.

    Field names must match the keys the spider uses when it
    populates the item.
    """

    title = scrapy.Field()    # movie title
    rank = scrapy.Field()     # rating score
    subject = scrapy.Field()  # one-line quote / summary
settings.py
修改user_agent , 下载延迟, 并发数
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
# CONCURRENT_REQUESTS = 32
#下载延迟
DOWNLOAD_DELAY = 3
#随机化下载延迟
#RANDOMIZE_DOWNLOAD_DELAY = True
运行爬虫文件并保存,默认支持的格式是.csv, .xml, .json
scrapy crawl douban -o douban.csv
爬取多个页面
需要翻页栏的url
douban.py
import scrapy
from scrapy import Selector, Request
from spiderTest.items import MovieItem
from scrapy.http import HtmlResponse
class DoubanSpider(scrapy.Spider):
    """Spider that follows the paginator links on Douban Movie Top 250."""

    name = "douban"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/top250"]

    def parse(self, response: HtmlResponse):
        selector = Selector(response)
        # 25 movie entries per list page (inspect with F12 to see why).
        for entry in selector.css('#content > div > div.article > ol > li'):
            item = MovieItem()
            # Keys must match the Field names declared on MovieItem;
            # extract_first() pulls the string out of the selector.
            item['title'] = entry.css('span.title::text').extract_first()
            item['rank'] = entry.css('span.rating_num::text').extract_first()
            item['subject'] = entry.css('span.inq::text').extract_first()
            # Yield from a generator rather than returning.
            yield item
        # Follow every paginator link; ::attr(href) extracts the attribute.
        for href in selector.css('div.paginator > a::attr(href)'):
            # The href is relative, so join it against the response URL
            # instead of concatenating strings by hand.
            yield Request(url=response.urljoin(href.extract()))
直接这样会重复调度:第2页、第3页都会拿到第5页的链接,可能多拿数据——更好的方法是把上面的 start_urls 重写成一个 start_requests 方法
import scrapy
from scrapy import Selector, Request
from spiderTest.items import MovieItem
from scrapy.http import HtmlResponse
class DoubanSpider(scrapy.Spider):
    """Spider that builds all ten Top-250 page requests up front."""

    name = "douban"
    allowed_domains = ["movie.douban.com"]

    def start_requests(self):
        # Ten list pages are wanted; generating the requests here means
        # parse() never has to discover pagination URLs itself.
        for page in range(10):
            offset = page * 25
            yield Request(url=f'https://movie.douban.com/top250?start={offset}&filter=')

    def parse(self, response: HtmlResponse):
        # Each list page holds 25 movie entries (see F12 page inspection).
        for entry in Selector(response).css('#content > div > div.article > ol > li'):
            item = MovieItem()
            # Keys must match the Field names declared on MovieItem;
            # extract_first() pulls the string out of the selector.
            item['title'] = entry.css('span.title::text').extract_first()
            item['rank'] = entry.css('span.rating_num::text').extract_first()
            item['subject'] = entry.css('span.inq::text').extract_first()
            yield item
使用管道将数据写入excel文件
需要三方库openpyxl支持,创建工作簿,向里面添加工作表,然后添加数据并保存
TIPS:
pip freeze > requirements.txt 将自己的环境运行库保存为文件,方便别人读取和安装
pip install -r requirements.txt
pipelines.py
from itemadapter import ItemAdapter
import openpyxl
#这里的方法都是钩子函数,不需要自己主动调
#用于处理数据
class SpidertestPipeline:
    """Pipeline hooks invoked by Scrapy; never called directly.

    Writes every scraped item into one Excel workbook.
    """

    def __init__(self):
        # Runs once: a single workbook for the whole crawl.
        self.wb = openpyxl.Workbook()
        # The default sheet is wb.active; wb.create_sheet() would add more.
        self.ws = self.wb.active
        self.ws.title = 'TOP250'
        self.ws.append(('标题', '评分', '主题'))  # header row

    def close_spider(self, spider):
        # Runs once at shutdown: persist the workbook to disk.
        self.wb.save('电影数据.xlsx')

    def process_item(self, item, spider):
        # Runs once per item. Use .get() with a default rather than
        # item['key'] because any field may be missing.
        row = (
            item.get('title', ''),
            item.get('rank', ''),
            item.get('subject', ''),
        )
        self.ws.append(row)  # one 3-tuple per worksheet row
        return item
settings.py
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
"spiderTest.pipelines.SpidertestPipeline": 300,#数字表示优先级,越小越先执行
}
使用管道把数据写入数据库
数据表如下:
需要设置用户权限,对数据表可以操作(增删改查即可)
需要第三方库pymysql支持
pipelines.py
from itemadapter import ItemAdapter
import openpyxl
import pymysql
#这里的方法都是钩子函数,不需要自己主动调
#用于处理数据
#可以用两个pipeline同时把数据写入excel和数据库
class DbPipeline:
    """Insert each movie item into the MySQL table tb_top_movie.

    Can be enabled alongside the Excel pipeline to write both targets.
    """

    def __init__(self):
        # Open one connection for the lifetime of the crawl.
        self.conn = pymysql.connect(host='127.0.0.1', port=3306,
                                    user='', password='',
                                    database='spider', charset='utf8')
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        # Commit everything once at the end, then release the connection.
        self.conn.commit()
        self.conn.close()

    def process_item(self, item, spider):
        # One INSERT per item; .get() supplies defaults for missing fields.
        row = (item.get('title', ''),
               item.get('rank', 0),
               item.get('subject', ''))
        self.cursor.execute(
            'insert into tb_top_movie (title, rating, subject) values (%s, %s, %s)',
            row
        )
        return item
class ExcelPipeline:
    """Write every movie item as one row of 电影数据.xlsx."""

    def __init__(self):
        # Invoked once: create the workbook and configure its sheet.
        self.wb = openpyxl.Workbook()
        self.ws = self.wb.active  # default sheet; create_sheet() adds more
        self.ws.title = 'TOP250'
        self.ws.append(('标题', '评分', '主题'))  # header row

    def close_spider(self, spider):
        # Invoked once when the crawl ends: flush the workbook to disk.
        self.wb.save('电影数据.xlsx')

    def process_item(self, item, spider):
        # Invoked per item; .get() guards against missing fields.
        self.ws.append((item.get('title', ''),
                        item.get('rank', 0),
                        item.get('subject', '')))
        return item
settings.py
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
"spiderTest.pipelines.ExcelPipeline": 300,#数字表示优先级,越小越先执行
"spiderTest.pipelines.DbPipeline": 400,
}
上面的写入数据是一条一条写入的,可以优化为批量写入:
pipelines.py
from itemadapter import ItemAdapter
import openpyxl
import pymysql
#这里的方法都是钩子函数,不需要自己主动调
#用于处理数据
#可以用两个pipeline同时把数据写入excel和数据库
class DbPipeline:
    """Pipeline that batches items and bulk-inserts them into MySQL.

    Buffering rows and using executemany() is much faster than one
    INSERT per item, at the cost of holding rows in memory.
    """

    #: number of buffered rows that triggers a bulk insert
    BATCH_SIZE = 100

    def __init__(self):
        # Open one connection for the lifetime of the crawl.
        self.conn = pymysql.connect(host='127.0.0.1', port=3306,
                                    user='', password='',
                                    database='spider', charset='utf8')
        self.cursor = self.conn.cursor()
        self.data = []  # buffer of rows awaiting insertion

    def close_spider(self, spider):
        # The final batch is usually smaller than BATCH_SIZE, so flush
        # whatever is left before closing the connection.
        if self.data:
            self._write_to_db()
        self.conn.close()

    def process_item(self, item, spider):
        # Called once per item; buffer the row, flush when the batch fills.
        title = item.get('title', '')
        rank = item.get('rank', 0)
        subject = item.get('subject', '')
        self.data.append((title, rank, subject))
        # Fix: ">=" instead of "==" so the buffer can never silently grow
        # past the threshold if the count ever overshoots.
        if len(self.data) >= self.BATCH_SIZE:
            self._write_to_db()
        return item

    def _write_to_db(self):
        """Bulk-insert the buffered rows, commit, and reset the buffer."""
        self.cursor.executemany(
            'insert into tb_top_movie (title, rating, subject) values (%s, %s, %s)',
            self.data
        )
        self.conn.commit()
        self.data.clear()  # empty the buffer for the next batch
class ExcelPipeline:
    """Append every movie item to an Excel worksheet and save on close."""

    HEADER = ('标题', '评分', '主题')

    def __init__(self):
        # Hook, called once: one workbook per crawl.
        self.wb = openpyxl.Workbook()
        # Default sheet via wb.active; wb.create_sheet() would add another.
        self.ws = self.wb.active
        self.ws.title = 'TOP250'
        self.ws.append(self.HEADER)

    def close_spider(self, spider):
        # Hook, called once at shutdown: persist the workbook.
        self.wb.save('电影数据.xlsx')

    def process_item(self, item, spider):
        # Hook, called per item; .get() avoids KeyError on missing fields.
        title = item.get('title', '')
        rank = item.get('rank', 0)
        subject = item.get('subject', '')
        self.ws.append((title, rank, subject))
        return item
其中,插入的语句多次重复,可以抽取出来为单独方法;
VSCODE和PYCHARM中的右键重构可以很方便的自动生成
def _write_to_db(self):
    """Bulk-insert all buffered rows and commit the transaction."""
    sql = 'insert into tb_top_movie (title, rating, subject) values (%s, %s, %s)'
    self.cursor.executemany(sql, self.data)
    self.conn.commit()
如果爬取的太多被ban
1、使用代理
在Request中添加meta参数,添加代理
(使用socks代理需要pysocks库)
douban.py中修改:
失败的话切换成全局代理尝试
或者下载中间件中修改:DownloaderMiddleware
def process_request(self, request, spider):
    """Downloader-middleware hook: route the outgoing request via a proxy.

    Fill in a real proxy URL, e.g. 'http://user:pass@host:port' or
    'socks5://host:port' (SOCKS proxies need the pysocks package).
    """
    # Fix: the original used curly "smart" quotes, which are a SyntaxError,
    # and assigned request.meta wholesale, discarding any metadata Scrapy
    # had already attached.  Set only the 'proxy' key instead.
    request.meta['proxy'] = ''
    return None
2、cookie(302报错)
我们可以利用 cookie 来维持登录状态。下面我们以 github 为例来说明一下,首先我们登录 github(https://github.com/),然后将 headers 中的 cookies 内容复制下来:
然后将其添加到 headers 里面, 再去发送请求:
import requests

# NOTE(review): the original snippet used curly quotes (‘ ’ “ ”), which are
# a SyntaxError in Python, and embedded a live GitHub session cookie.
# Never paste real session cookies into notes or source control — they
# grant full access to the logged-in account.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    # Paste the Cookie header copied from the browser dev tools here.
    'Cookie': '<cookie string copied from the logged-in browser session>',
}
resp = requests.get('https://github.com', headers=headers)
print(resp.text)
或者:
在中间件中:
def get_cookies_dict(cookies_str=''):
    """Convert a raw Cookie header string into a dict.

    Proxies carry no login state, so the cookies copied from a logged-in
    browser session are parsed here and attached to every request.

    Fixes over the original:
    - an empty string no longer raises ValueError (the original tried to
      unpack ''.split('=', maxsplit=1), which yields one element);
    - surrounding whitespace is stripped, since browsers separate cookie
      pairs with '; ' (semicolon plus space);
    - the cookie string is a parameter (default keeps the old behaviour).
    """
    cookies_dict = {}
    for pair in cookies_str.split(';'):
        pair = pair.strip()
        if not pair or '=' not in pair:
            continue  # skip empty / malformed fragments
        # maxsplit=1: cookie values may themselves contain '='.
        key, value = pair.split('=', maxsplit=1)
        cookies_dict[key.strip()] = value.strip()
    return cookies_dict


COOKIES_DICT = get_cookies_dict()
在下载中间件中:
class SpidertestDownloaderMiddleware:
    """Downloader middleware that attaches login cookies to every request.

    Not all hooks need to be defined; Scrapy treats a missing hook as
    "do not modify the passed object".
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy; also wires up the spider_opened signal.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_request(self, request: Request, spider):
        # Copy the pre-parsed cookie dict onto the outgoing request.
        # Returning None lets processing continue down the chain.
        request.cookies = COOKIES_DICT
        return None
在settings.py中配置中间件
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
"spiderTest.middlewares.SpidertestDownloaderMiddleware": 543,#值越小越先执行
}
具体页面解析
并发进行无法保证顺序
douban.py
import scrapy
from scrapy import Request, Selector
from scrapy.http import HtmlResponse
from spiderTest.items import MovieItem
class DoubanSpider(scrapy.Spider):
    """Spider that scrapes list pages and follows each movie's detail page.

    Requests are processed concurrently, so result order is not guaranteed.
    """

    name = "douban"
    allowed_domains = ["movie.douban.com"]

    def start_requests(self):
        # Build the page requests up front so parse() never has to
        # discover pagination URLs (range(1) == first page only here).
        for page in range(1):
            yield Request(url=f'https://movie.douban.com/top250?start={page*25}&filter=')

    def parse(self, response: HtmlResponse):
        """Default callback: parse one Top-250 list page (25 entries)."""
        sel = Selector(response)
        for list_item in sel.css('#content > div > div.article > ol > li'):
            # Link to the movie's own page, needed for duration/intro.
            detail_url = list_item.css('div.info > div.hd > a::attr(href)').extract_first()
            movie_item = MovieItem()
            # Keys must match the Field names declared on MovieItem.
            movie_item['title'] = list_item.css('span.title::text').extract_first()
            movie_item['rank'] = list_item.css('span.rating_num::text').extract_first()
            movie_item['subject'] = list_item.css('span.inq::text').extract_first()
            if detail_url is None:
                # Fix: Request(url=None) raises; emit the partial item
                # instead of crashing when an entry has no detail link.
                yield movie_item
                continue
            # The item holds only three fields so far — hand it to the
            # detail callback to be completed rather than yielding it here.
            yield Request(
                url=response.urljoin(detail_url),  # tolerate relative hrefs
                callback=self.parse_detail,
                cb_kwargs={'item': movie_item}
            )

    def parse_detail(self, response, **kwargs):
        """Fill in duration and intro from the movie's detail page."""
        movie_item = kwargs['item']
        sel = Selector(response)
        movie_item['duration'] = sel.css('span[property="v:runtime"]::attr(content)').extract_first()
        movie_item['intro'] = sel.css('span[property="v:summary"]::text').extract_first()
        yield movie_item
pipelines.py
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import openpyxl
import pymysql
#这里的方法都是钩子函数,不需要自己主动调
#用于处理数据
#可以用两个pipeline同时把数据写入excel和数据库
class DbPipeline:
    """Pipeline that batches 5-field movie rows into a MySQL table.

    Buffering and executemany() beat one INSERT per item, at the cost
    of holding rows in memory until the batch flushes.
    """

    #: number of buffered rows that triggers a bulk insert
    BATCH_SIZE = 100

    def __init__(self):
        # Open one connection for the lifetime of the crawl.
        self.conn = pymysql.connect(host='127.0.0.1', port=3306,
                                    user='', password='',
                                    database='spider', charset='utf8')
        self.cursor = self.conn.cursor()
        self.data = []  # buffer of rows awaiting insertion

    def close_spider(self, spider):
        # The final batch is usually smaller than BATCH_SIZE, so flush
        # whatever is left before closing the connection.
        if len(self.data) > 0:
            self._write_to_db()
        self.conn.close()

    def process_item(self, item, spider):
        # Called once per item; .get() supplies defaults for missing fields.
        title = item.get('title', '')
        rank = item.get('rank', 0)
        subject = item.get('subject', '')
        duration = item.get('duration', '')
        intro = item.get('intro', '')
        self.data.append((title, rank, subject, duration, intro))
        # Fix: ">=" instead of "==" so the buffer can never silently grow
        # past the threshold if the count ever overshoots.
        if len(self.data) >= self.BATCH_SIZE:
            self._write_to_db()
            self.data.clear()  # reset the buffer for the next batch
        return item

    def _write_to_db(self):
        """Bulk-insert the buffered rows and commit."""
        self.cursor.executemany(
            'insert into tb_top_movie (title, rating, subject, duration, intro) values (%s, %s, %s, %s, %s)',
            self.data
        )
        self.conn.commit()
items.py
import scrapy
#组装数据为item对象
class MovieItem(scrapy.Item):
    """Container for one movie's scraped data, including detail-page fields.

    Field names must match the keys the spider uses when it populates
    the item.
    """

    title = scrapy.Field()     # movie title
    rank = scrapy.Field()      # rating score
    subject = scrapy.Field()   # one-line quote / summary
    duration = scrapy.Field()  # runtime, from the detail page
    intro = scrapy.Field()     # synopsis, from the detail page
WARNING:
将简介写入excel时,会出错
因为含有换行符等其他字符,excel无法接收
solution:
使用正则表达式替换
但是直接使用这样的方法遇到下面的情况会出问题:
由于中间有《br》标签把两端文字分隔开,导致只能拿到第一段文字
我的方法是:把 extract_first() 替换成 getall(),可以得到 span 下被 <br> 分隔开的所有文本段(以 list 形式存储),再用字符串拼接合成完整文字。需要注意:拼接本身并不会过滤换行符等字符,要得到干净的文本仍需对每段做 strip() 或替换处理,但确实不必使用正则表达式
douban.py中进行如下修改
def parse_detail(self, response, **kwargs):
    """Complete the partially-filled item with duration and intro.

    The summary <span> is split into several text nodes by <br> tags,
    so ::text with getall() is needed to collect every fragment —
    extract_first() would drop everything after the first <br>.
    """
    movie_item = kwargs['item']
    sel = Selector(response)
    movie_item['duration'] = sel.css('span[property="v:runtime"]::attr(content)').extract_first()
    # Fix: joining the fragments does NOT remove newlines or full-width
    # spaces by itself, and Excel rejects some control characters in cell
    # values — strip each fragment before joining to guarantee clean text.
    fragments = sel.css('span[property="v:summary"]::text').getall()
    movie_item['intro'] = ''.join(fragment.strip() for fragment in fragments)
    yield movie_item
得到的excel效果:
完美的两段!