Installation order (Windows):
pip install wheel
Download the Twisted .whl file from http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
pip install the Twisted .whl file that matches your Python version
pip install scrapy
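For example, on 64-bit Windows with Python 3.7 the session would look roughly like this (the Twisted wheel filename is only an illustration; pick the one matching your Python version and architecture):

pip install wheel
pip install Twisted-20.3.0-cp37-cp37m-win_amd64.whl
pip install scrapy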
Creating the project:
Run scrapy on the command line to check that the installation works.
Generate the project:
scrapy startproject <project name>
Open the project in an IDE (PyCharm).
Configure the project interpreter.
Create a spider: cd <project name>, then run
scrapy genspider example <domain of the site to crawl>
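For reference, genspider produces a spider skeleton roughly like the following (this is the default template of Scrapy versions from around this era; the exact template varies by version):

# -*- coding: utf-8 -*-
import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    def parse(self, response):
        pass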
Create an execution entry point (e.g. a main.py at the project root, so the spider can be run and debugged from the IDE):
Contents:
# -*- coding: utf-8 -*-
from scrapy.cmdline import execute
import sys
import os

# make sure the project root is on sys.path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# note: scrapy crawl takes the *spider* name (the name attribute of the
# spider class), not the project name
execute(["scrapy", "crawl", "<spider name>"])
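In PyCharm you can then run or debug this file directly; breakpoints set inside the spider callbacks will be hit.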
Configuration (settings.py):
Database settings:
HTTPERROR_ALLOWED_CODES = [400, 302]  # keep crawling when these status codes are returned
MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "heypik"
MYSQL_USER = "root"
MYSQL_PASSWORD = "root"
Pipeline registration:
ITEM_PIPELINES = {
    # 300 is the pipeline priority (an integer from 0 to 1000; lower runs first)
    'ArticleSpider.pipelines.MysqlTwistedPipline': 300,
}
robots.txt compliance:
Usually set to False so the crawler ignores the target site's robots.txt rules:
ROBOTSTXT_OBEY = False
Item definitions (items.py):
SQL assembly:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class FlatspiderItem(scrapy.Item):
    # one Field per column of the INSERT below (the spider fills all of them)
    label = scrapy.Field()
    sponsor = scrapy.Field()
    keywords = scrapy.Field()
    author_link = scrapy.Field()
    create_time = scrapy.Field()
    auth_id = scrapy.Field()
    downloads = scrapy.Field()
    like = scrapy.Field()
    total_resources = scrapy.Field()
    pv = scrapy.Field()
    detail_id = scrapy.Field()
    detail_title = scrapy.Field()
    json = scrapy.Field()
    dfu = scrapy.Field()

    def get_insert_sql(self):
        # the single '%s' is expanded in the pipeline by joining all params
        # with ',' -- see do_insert() in pipelines.py
        insert_sql = '''
            insert ignore into freepik (label, sponsor, keywords, author_link,
            create_time, auth_id, downloads, `like`, total_resources, pv,
            detail_id, detail_title, json, dfu) values ('%s')
        '''
        params = (
            self['label'], self['sponsor'], self['keywords'], self['author_link'],
            self['create_time'], self['auth_id'], self['downloads'], self['like'],
            self['total_resources'], self['pv'], self['detail_id'],
            self['detail_title'], self['json'], self['dfu'],
        )
        return insert_sql, params
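To see what that single '%s' placeholder expands to, here is a minimal standalone sketch of the same string-join trick the pipeline uses (table and values are made up):

insert_sql = "insert ignore into demo (a, b, c) values ('%s')"
params = ('1', 'x', 'y')
sql = insert_sql % "','".join(params)
# -> insert ignore into demo (a, b, c) values ('1','x','y')
print(sql)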
Pipeline implementation (pipelines.py):
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from twisted.enterprise import adbapi
import MySQLdb
import MySQLdb.cursors


class ArticlespiderPipeline(object):
    def process_item(self, item, spider):
        # title = item.get('title')
        keyword = item.get('keyword')
        return item


class MysqlTwistedPipline(object):
    """Writes items to MySQL asynchronously through Twisted's adbapi pool."""

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # run the insert on a pooled connection in a worker thread
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        return item  # pipelines must return the item so later stages receive it

    def handle_error(self, failure, item, spider):
        print(failure)
        print('--------------------------------')

    def do_insert(self, cursor, item):
        insert_sql, params = item.get_insert_sql()
        # expand the single '%s' by joining the (string) params with ','
        # note: this does no SQL escaping, so the values must not contain
        # single quotes (the spider strips them before this point)
        sql = insert_sql % "','".join(params)
        res = cursor.execute(sql)
        if res:
            print('=====**insert**=======')
        else:
            print('========pass==========')
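A safer variant (not what the code above does) is to let the MySQLdb driver handle the quoting: write the statement with one %s placeholder per column and pass the params tuple to execute(). A sketch, assuming get_insert_sql() were changed to return such a statement:

def do_insert(self, cursor, item):
    # hypothetical: insert_sql here is e.g.
    # "insert ignore into freepik (label, ...) values (%s, %s, ..., %s)"
    insert_sql, params = item.get_insert_sql()
    cursor.execute(insert_sql, params)  # the driver escapes each value itself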
Crawling patterns:
For a normal server-rendered (synchronous) site, use CSS selectors:
def parse(self, response):
    # select the HTML nodes of interest
    post_list = response.css('<selector for the target nodes>')
    # pull out a concrete value, e.g. a link or a piece of text
    value = response.css('<selector>::attr(href)').extract_first('')
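A concrete sketch, with made-up selectors for a hypothetical listing page:

from scrapy import Request
from scrapy.spiders import Spider


class ListSpider(Spider):
    name = 'list_demo'  # hypothetical spider
    start_urls = ['http://example.com/']

    def parse(self, response):
        # iterate over each post node on the listing page
        for post in response.css('div.post'):
            title = post.css('h2 a::text').extract_first('')
            link = post.css('h2 a::attr(href)').extract_first('')
            # follow the detail link; urljoin resolves relative URLs
            yield Request(url=response.urljoin(link), callback=self.parse_detail)

    def parse_detail(self, response):
        pass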
Issuing requests:
Yield the Request to hand it to Scrapy's scheduler (all downloads run asynchronously):
yield Request(url=next_page, headers=self.headers, callback=self.parse)
A Request that is only constructed but never yielded or returned is never scheduled:
Request(url=next_page, headers=self.headers, callback=self.parse)
Scraping AJAX-driven sites:
1. Use the browser developer tools' Network panel to find the AJAX request the site sends for its data, then request that endpoint directly. Example:
# -*- coding: utf-8 -*-
# @Time    : 2017/4/9 14:32
# @Author  : woodenrobot
import re
import json

from scrapy import Request
from scrapy.spiders import Spider

from scrapyspider.items import DoubanMovieItem


class DoubanAJAXSpider(Spider):
    name = 'douban_ajax'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    }

    def start_requests(self):
        url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20'
        yield Request(url, headers=self.headers)

    def parse(self, response):
        datas = json.loads(response.body)
        item = DoubanMovieItem()
        if datas:
            for data in datas:
                item['ranking'] = data['rank']
                item['movie_name'] = data['title']
                item['score'] = data['score']
                item['score_num'] = data['vote_count']
                yield item
            # if datas is non-empty, crawl the next page
            page_num = re.search(r'start=(\d+)', response.url).group(1)
            page_num = 'start=' + str(int(page_num) + 20)
            next_url = re.sub(r'start=\d+', page_num, response.url)
            yield Request(next_url, headers=self.headers)
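Run it with scrapy crawl douban_ajax (optionally add -o movies.json to dump the items to a file).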
Special cases: when the data is embedded in inline JavaScript rather than in the HTML, it can be harvested with a regular expression:
def go_detail(self, response):
    # this method lives in the spider class; it assumes `import re`,
    # `import json` and `from .items import FlatspiderItem` at module level
    detail_main = response.css('.detail ::attr(data-id)').extract_first('')
    detail_title = response.css('.thumb ::attr(alt)').extract_first('')
    post_list = response.text
    # capture the JS object literal passed to setDetailAttributes(...)
    node = re.compile(r'setDetailAttributes\(\"\d+\",\s(\{.*?\})\);', re.DOTALL)
    # node = re.compile(r'setDetailAttributes\(\"\d+\",\s(\{.*?mkl:\"(.*?)\".*?\})\);', re.DOTALL)
    node = node.findall(post_list)
    for data in node:
        print(detail_title + detail_main)
        # rewrite the JS object literal into valid JSON:
        data = data.replace('\n', '')
        data = data.replace(':"', '":"')   # close-quote the key before each value
        data = data.replace(': "', '":"')
        data = data.replace('",', '","')   # open-quote the key after each comma
        data = data.replace('{', '{"')     # open-quote the first key
        json_str = data
        data = json.loads(data)
        insertData = FlatspiderItem()
        insertData['label'] = data['mkl']
        insertData['sponsor'] = data['mks']
        insertData['keywords'] = data['mk']
        insertData['author_link'] = data['alink']
        insertData['create_time'] = data['cd']
        insertData['auth_id'] = data['aid']
        insertData['downloads'] = data['dc']
        insertData['like'] = data['cl']
        insertData['total_resources'] = data['atr']
        insertData['pv'] = data['ph']
        insertData['detail_id'] = detail_main
        insertData['detail_title'] = detail_title
        # strip single quotes so the string-join SQL in the pipeline stays valid
        insertData['json'] = json_str.replace("'", '')
        insertData['dfu'] = data['dfu']
        yield insertData
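To make the replace() chain concrete, here is what it does to a made-up fragment of such an object literal:

import json

raw = '{mkl:"flowers",mks:"premium"}'  # invented sample input
s = raw.replace('\n', '')
s = s.replace(':"', '":"')    # {mkl":"flowers",mks":"premium"}
s = s.replace(': "', '":"')   # (no effect on this sample)
s = s.replace('",', '","')    # {mkl":"flowers","mks":"premium"}
s = s.replace('{', '{"')      # {"mkl":"flowers","mks":"premium"}
print(json.loads(s)['mkl'])   # -> flowers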