Preface
Being stuck at home with too much time lately, I went back to the courses I once took on iMOOC, and took the chance to properly learn Scrapy, a framework I only half understood before, following the course "Python最火爬虫框架Scrapy入门与实践" (Getting Started with Scrapy, Python's Most Popular Crawler Framework).
Code
Step 1: Create a new project
In cmd, run scrapy startproject xxx
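startproject scaffolds the standard Scrapy layout (xxx stands for whatever project name you pass; all the files edited in the steps below live inside it):

xxx/
    scrapy.cfg
    xxx/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py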
Step 2: Create the spider
cd into the spiders folder and run scrapy genspider <spider-name> <domain>
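For this project that would be the following, matching the spider name and domain used in the code later on:

scrapy genspider douban_spider movie.douban.com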
Step 3: Define the target
Edit items.py to declare the fields you want to save.
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # ranking, movie name, introduction, star rating, review count, one-line description
    serial_number = scrapy.Field()
    movie_name = scrapy.Field()
    introduce = scrapy.Field()
    star = scrapy.Field()
    evaluate = scrapy.Field()
    depict = scrapy.Field()
Step 4: Parse the page and extract the fields you want to save
# -*- coding: utf-8 -*-
import scrapy
from douban.items import DoubanItem  # import from the project package (assuming the project is named douban)


class DoubanSpiderSpider(scrapy.Spider):
    # spider name
    name = 'douban_spider'
    # allowed domains
    allowed_domains = ['movie.douban.com']
    # entry URL
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        movie_list = response.xpath('//div[@class="article"]//ol[@class="grid_view"]/li')
        for i in movie_list:
            douban_item = DoubanItem()
            douban_item['serial_number'] = i.xpath(
                ".//div[@class='item']//em/text()").extract_first()
            douban_item['movie_name'] = i.xpath(
                ".//div[@class='info']//div[@class='hd']/a/span[@class='title'][1]/text()").extract_first()
            # the intro is split across several text nodes; strip the whitespace and join them
            content = i.xpath(".//div[@class='info']//div[@class='bd']/p[1]/text()").extract()
            douban_item['introduce'] = ";".join("".join(j.split()) for j in content)
            douban_item['star'] = i.xpath(".//span[@class='rating_num']/text()").extract_first()
            douban_item['evaluate'] = i.xpath(".//div[@class='star']//span[4]/text()").extract_first()
            douban_item['depict'] = i.xpath(".//p[@class='quote']/span/text()").extract_first()
            yield douban_item
        # follow the "next page" link until the last page
        next_link = response.xpath("//span[@class='next']/link/@href").extract()
        if next_link:
            yield scrapy.Request('https://movie.douban.com/top250' + next_link[0], callback=self.parse)
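At this point you can already do a dry run from the project root. The -o flag just dumps the items to a local file, which is handy for sanity-checking the XPaths before wiring up MySQL:

scrapy crawl douban_spider -o douban.csv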
Step 5: Write the MySQL connection
Edit pipelines.py:
import pymysql


class MySQLPipeline(object):
    def __init__(self):
        # connect to the database
        self.connect = pymysql.connect(
            host='127.0.0.1',  # your MySQL host
            port=3306,
            db='scrapy',
            user='root',
            passwd='xxxxx',
            charset='utf8',
            use_unicode=True)
        # get a cursor to execute statements with
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # the item fields map one-to-one onto the table columns
        self.cursor.execute(
            """INSERT INTO douban (serial_number, movie_name, introduce, star, evaluate, depict)
               VALUES (%s, %s, %s, %s, %s, %s)""",
            (item['serial_number'],
             item['movie_name'],
             item['introduce'],
             item['star'],
             item['evaluate'],
             item['depict']))
        # commit the insert
        self.connect.commit()
        # return the item so any later pipelines still see it
        return item

    def close_spider(self, spider):
        # close the cursor and the connection
        self.cursor.close()
        self.connect.close()
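The pipeline assumes a douban table already exists in the scrapy database. A minimal one-off script to create it, sketched with pymysql; the column names match the item fields, but the types and sizes are my own guesses, so size them to taste:

import pymysql

connect = pymysql.connect(host='127.0.0.1', port=3306, db='scrapy',
                          user='root', passwd='xxxxx', charset='utf8')
cursor = connect.cursor()
# id is a synthetic primary key; the scraped fields are stored as plain strings
cursor.execute("""
    CREATE TABLE IF NOT EXISTS douban (
        id INT AUTO_INCREMENT PRIMARY KEY,
        serial_number VARCHAR(8),
        movie_name VARCHAR(128),
        introduce VARCHAR(512),
        star VARCHAR(8),
        evaluate VARCHAR(32),
        depict VARCHAR(256)
    ) DEFAULT CHARSET=utf8
""")
connect.commit()
connect.close()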
Watch out! Column names must not be describe, desc and the like — those are MySQL reserved keywords. This pitfall cost me a long time to track down! Yet another reminder to myself to go review MySQL...
Step 6: Edit middlewares.py to hide your identity
import random

# user agent list
USER_AGENT_LIST = [
'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23',
'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)'
]
# pick a user agent at random
USER_AGENT = random.choice(USER_AGENT_LIST)
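Note that random.choice at module level runs once at import time, so the whole crawl uses a single user agent. To rotate per request, wrap the list in a downloader middleware; a minimal sketch (the class name RandomUserAgentMiddleware is my own, and it has to be registered under DOWNLOADER_MIDDLEWARES in settings.py):

class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # overwrite the User-Agent header on every outgoing request
        request.headers['User-Agent'] = random.choice(USER_AGENT_LIST)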
Adapted from the article "python爬虫之scrapy中user agent浅谈(两种方法)" (a quick look at user agents in Scrapy, two approaches).
Step 7: Edit settings.py
Uncomment DOWNLOADER_MIDDLEWARES around line 55 to enable the middleware, and ITEM_PIPELINES around line 67 to enable the pipeline.
Set DOWNLOAD_DELAY to throttle requests, e.g. change it to 0.5.
Change USER_AGENT (you can skip this if you use the middleware instead).
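Put together, the relevant part of settings.py looks roughly like this. The module paths assume the project is named douban, and 543/300 are just the usual default priority values:

DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.RandomUserAgentMiddleware': 543,
}
ITEM_PIPELINES = {
    'douban.pipelines.MySQLPipeline': 300,
}
# delay between requests, in seconds, so as not to hammer the site
DOWNLOAD_DELAY = 0.5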