创建爬虫项目
#选择项目存储位置
#shift + 右键:打开dos窗口
scrapy startproject douban
创建pycharm运行文件/run.py 于项目内层目录
# Launch the spider programmatically (equivalent to running `scrapy crawl douban`).
from scrapy.cmdline import execute
execute(['scrapy','crawl','douban'])
# 'douban' must match the spider's `name` attribute
- douban文件夹
- spiders文件夹 #爬虫文件
- items.py #获取字段定义
- settings.py #设置
- pipelines.py #数据处理
- run.py #启动项
第一步定义字段/items.py
import scrapy


class DoubanItem(scrapy.Item):
    """Item holding the fields scraped for one Douban Top-250 movie."""
    # FIX: removed the dead `pass` statement that preceded the field
    # definitions, and restored the class-body indentation.
    url = scrapy.Field()         # detail-page link of the movie
    rank = scrapy.Field()        # position in the Top-250 list
    movie_name = scrapy.Field()  # movie title
    comment = scrapy.Field()     # one-line quote; missing for some movies
    price = scrapy.Field()       # rating score (name kept for compatibility)
第二步编写爬虫/spiders/douban_spider.py(名字自定义)
Request请求/parse出了问题
from scrapy import Request, Spider
from douban.items import DoubanItem
class DoubanScrapy(Spider):
    """Spider that crawls the Douban movie Top-250 list page by page."""

    name = 'douban'  # must match the name passed to `scrapy crawl`
    # first page of the Top-250 list
    start_urls = ["https://movie.douban.com/top250"]

    def start_requests(self):
        # FIX: the hook must be named start_requests (plural) or Scrapy
        # silently ignores it, and Request() takes one URL string — the
        # original passed the whole start_urls list.
        for url in self.start_urls:
            yield Request(url, callback=self.parse)

    def parse(self, response):
        """Yield one DoubanItem per movie entry, then follow the next page."""
        for msg in response.xpath('//div[@class="item"]'):
            item = DoubanItem()
            item['url'] = msg.xpath('div[@class="pic"]/a/@href').extract()[0]
            item['rank'] = msg.xpath('div[@class="pic"]/em/text()').extract()[0]
            item['movie_name'] = msg.xpath('div[@class="pic"]/a/img/@alt').extract()[0]
            item['price'] = msg.xpath('div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()[0]
            comment = msg.xpath('div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()').extract()
            # some movies have no quote on the list page; set the field
            # only when one exists
            if comment:
                item['comment'] = comment[0]
            yield item

        # follow the "next page" link, if any, and parse it the same way
        next_page = response.xpath('//span[@class="next"]/a/@href').extract_first()
        if next_page:
            request_url = response.urljoin(next_page)
            print(request_url)
            yield Request(request_url, callback=self.parse)
三 爬虫被屏蔽,F12参考正常请求进行修改/settings.py(或手动传入header参数)
此例:user_agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
至此可以在运行run.py查看效果
四 数据处理(写入mysql)/pipelines.py
1.根据自定义爬取字段,创建mysql数据表(navicat)
…
2.创建一个mysql文件夹(及其以下文件均为自建)专门处理mysql数据库
pipelines.py
sql.py
3.在sql.py操作数据表
import pymysql
from douban import settings
# Connection settings for the local MySQL server (database `scrapy`).
MYSQL_HOSTS = '127.0.0.1'
MYSQL_USER = 'root'
MYSQL_PASSWORD = ''
MYSQL_PORT = '3306'
MYSQL_DB = 'scrapy'


class Sql:
    """Static helper that writes scraped items into the `douban` table."""

    # @classmethod lets callers use Sql.insert_dd_name(item) without
    # instantiating the class.
    @classmethod
    def insert_dd_name(cls, item):
        """Insert one movie record; duplicates are skipped via INSERT IGNORE.

        Fixes over the original:
        - the per-character loop (`for i in range(len(item['movie_name']))`)
          inserted one row per *character* of the title, and
          `item['comment'][0]` stored only the quote's first character —
          the item fields are scalar strings, so a single insert is correct;
        - values are passed as query parameters instead of str.format, which
          avoids SQL injection and quoting bugs;
        - `rank` and `comment` are reserved words in recent MySQL, so the
          column names are backtick-quoted;
        - the module-level connection constants are actually used instead of
          a second hardcoded copy;
        - the connection is closed even if execute() raises.
        """
        db = pymysql.connect(host=MYSQL_HOSTS, port=int(MYSQL_PORT),
                             user=MYSQL_USER, passwd=MYSQL_PASSWORD,
                             db=MYSQL_DB, charset='utf8')
        try:
            cursor = db.cursor()
            sql = ("INSERT IGNORE INTO douban "
                   "(movie_name, price, `comment`, `rank`, url) "
                   "VALUES (%s, %s, %s, %s, %s)")
            cursor.execute(sql, (item['movie_name'], item['price'],
                                 item.get('comment'), item['rank'],
                                 item['url']))
            db.commit()
        finally:
            db.close()
4.写自定义的pipelines
from .sql import Sql
from douban.items import DoubanItem
class DoubanPipeline(object):
    """Pipeline that persists DoubanItem instances through the Sql helper."""

    def process_item(self, item, spider):
        # Items of any other type pass through untouched.
        if not isinstance(item, DoubanItem):
            return item
        Sql.insert_dd_name(item)
        return item
5.settings.py启用pipelines
# Register item pipelines: key = dotted path to the pipeline class,
# value = priority (any integer 1-1000; LOWER runs EARLIER).
ITEM_PIPELINES = {
# 'douban.pipelines.DoubanPipeline': 300,
'douban.mysql.pipelines.DoubanPipeline': 300,
# douban (project package) . mysql (hand-made package) . pipelines (hand-made
# module) . DoubanPipeline (class defined inside it); 300 is the priority.
}
6.启动run.py