1、jobbole.py implements the crawling strategy.
2、settings.py configures the item pipelines, image downloading, whether to obey the robots.txt protocol, the database connection, and so on.
3、pipelines.py implements the data-storage logic.
4、The pages were originally parsed with XPath, but inside the per-item loop every iteration returned the first entry: an XPath expression starting with // is evaluated against the whole document rather than the current selector (it would need the relative .// form). Switching this part to CSS selectors fixed it; see the sketch after this list.
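A minimal illustration of point 4 (using a made-up two-item HTML snippet, not the real page): inside a selector loop an XPath that starts with // is evaluated against the whole document, while the relative .// form or a CSS selector stays scoped to the current block.

from scrapy.selector import Selector

html = ('<div id="news_list">'
        '<div class="news_block"><h2><a href="/n/1/">first</a></h2></div>'
        '<div class="news_block"><h2><a href="/n/2/">second</a></h2></div>'
        '</div>')
sel = Selector(text=html)
for block in sel.css('#news_list .news_block'):
    print(block.xpath('//h2/a/@href').extract_first())    # always '/n/1/': searches the whole document
    print(block.xpath('.//h2/a/@href').extract_first())   # '/n/1/' then '/n/2/': relative to the block
    print(block.css('h2 a::attr(href)').extract_first())  # same scoped result as the .// form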
一、jobbole.py (crawling strategy)
# -*- coding: utf-8 -*-
import json
import os
import re
import sys
from urllib import parse
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.utils import Util
from items import JobbolespiderItem, ArticleItemLoader
import scrapy
from scrapy import Request
from scrapy.loader import ItemLoader
class JobboleSpider(scrapy.Spider):
name = 'jobbole'
allowed_domains = ['news.cnblogs.com']
start_urls = ['http://news.cnblogs.com/']
# def parse(self, response):
# jobbolespiderItem = JobbolespiderItem()
# jobbolespiderItem['front_image_url'] = ['https://images2018.cnblogs.com/news_topic/20180515154619133-1755088138.png']
# yield jobbolespiderItem
def parse(self, response):
item_selecters = response.css('#news_list .news_block')
# item_selecters = response.xpath('//div[@id="news_list"]/div[@class="news_block"]')
for item_selecter in item_selecters:
            # An XPath starting with // would search the whole document here, so CSS selectors are used inside the loop
print(item_selecter.extract())
front_image_url = item_selecter.css('.entry_summary a img::attr(src)').extract_first('')
if front_image_url.startswith('//'):
front_image_url = 'https:' + front_image_url
url = item_selecter.css('h2 a::attr(href)').extract_first("")
# front_image_url = item_selecter.xpath('//div[@class="entry_summary"]/a/img/@src').extract_first('')
# url = item_selecter.xpath('//div[@class="content"]/h2/a/@href').extract_first('')
            # Request the detail page
print(url)
yield Request(parse.urljoin(response.url, url), meta={"front_image_url": front_image_url},
callback=self.parse_detail)
        last_text = response.xpath('//div[@class="pager"]/a[last()]/text()').extract_first('')
        if last_text == 'Next >':
            # Request the next page
next_url = response.xpath('//div[@class="pager"]/a[last()]/@href').extract_first('')
yield Request(parse.urljoin(response.url, next_url), callback=self.parse)
def parse_detail(self, response):
# jobbolespiderItem = JobbolespiderItem()
# if response.meta.get('front_image_url'):
# jobbolespiderItem['front_image_url'] = [parse.urljoin(response.url, response.meta.get('front_image_url'))]
# else:
# jobbolespiderItem['front_image_url'] = []
# if response.xpath('//div[@id="news_title"]/a/text()').extract_first(''):
# jobbolespiderItem['title'] = response.xpath('//div[@id="news_title"]/a/text()').extract_first('')
# else:
# jobbolespiderItem['title'] = ''
# if response.xpath('//div[@id="news_info"]/span[@class="time"]/text()').extract_first(''):
# create_date_content = response.xpath('//div[@id="news_info"]/span[@class="time"]/text()').extract_first('')
# else:
# create_date_content = ''
# print(create_date_content)
# try:
# if re.match(r'发布于 (.*)', create_date_content).group(1):
# jobbolespiderItem['create_date'] = re.match(r'发布于 (.*)', create_date_content).group(1)
# else:
# jobbolespiderItem['create_date'] = '1970-01-01'
# except Exception as e:
# print(e)
# jobbolespiderItem['content'] = response.xpath('//div[@id="news_content"]/div[@id="news_body"]').extract_first(
# '')
# tag_list = response.xpath('//div[@class="news_tags"]/a/text()').extract()
# jobbolespiderItem['tags'] = ','.join(tag_list)
        # https://news.cnblogs.com/NewsAjax/GetPreNewsById?contentId=665930
# print(response.url)
# url_new = parse.urljoin(response.url, '/NewsAjax/GetAjaxNewsInfo?contentId={}'.format(id))
item_loader = ArticleItemLoader(item=JobbolespiderItem(),response=response)
item_loader.add_xpath('title','//div[@id="news_title"]/a/text()')
item_loader.add_xpath('content','//div[@id="news_content"]/div[@id="news_body"]')
item_loader.add_xpath('tags','//div[@class="news_tags"]/a/text()')
item_loader.add_xpath('create_date','//div[@id="news_info"]/span[@class="time"]/text()')
item_loader.add_value("url", response.url)
if response.meta.get('front_image_url'):
item_loader.add_value('front_image_url',parse.urljoin(response.url, response.meta.get('front_image_url')))
article_item = item_loader.load_item()
        news_id = re.match(r'.*?(\d+)', response.url).group(1)
        yield Request(parse.urljoin(response.url, '/NewsAjax/GetAjaxNewsInfo?contentId={}'.format(news_id)),
                      callback=self.parse_nums, meta={'article_item': article_item})
def parse_nums(self, response):
jobbolespiderItem = response.meta.get('article_item')
if jobbolespiderItem.get('front_image_url'):
jobbolespiderItem['image_url_id'] = Util().trans_md5(jobbolespiderItem.get('front_image_url')[0])
r_json = json.loads(response.text)
jobbolespiderItem['content_id'] = r_json.get('ContentID')
jobbolespiderItem['comment_count'] = r_json.get('CommentCount')
jobbolespiderItem['total_view'] = r_json.get('TotalView')
jobbolespiderItem['digg_count'] = r_json.get('DiggCount')
jobbolespiderItem['bury_count'] = r_json.get('BuryCount')
yield jobbolespiderItem
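jobbole.py also imports Util from utils/utils.py, which is not shown in this post. Judging from how it is used above (turning the first image URL into image_url_id), trans_md5 is presumably just an MD5 digest of the string; a minimal sketch of such a helper, as an assumption rather than the real utils.py, could be:

import hashlib

class Util:
    def trans_md5(self, value):
        # Hash an arbitrary string (e.g. an image URL) into a fixed-length hex id.
        # Guessed behaviour only; the project's actual utils.py may differ.
        if isinstance(value, str):
            value = value.encode('utf-8')
        return hashlib.md5(value).hexdigest()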
二、settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for AricleSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import os
BOT_NAME = 'AricleSpider'
SPIDER_MODULES = ['AricleSpider.spiders']
NEWSPIDER_MODULE = 'AricleSpider.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'AricleSpider (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'AricleSpider.middlewares.AriclespiderSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'AricleSpider.middlewares.AriclespiderDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'AricleSpider.pipelines.AricleImagePipeline': 1,
# 'AricleSpider.pipelines.AricleSaveJsonPipeline': 2,
# 'AricleSpider.pipelines.AricleSaveDBPipeline': 3,
'AricleSpider.pipelines.MysqlTwistedPipeline': 4,
# 'AricleSpider.pipelines.AriclespiderPipeline': 300
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Image download configuration
IMAGES_URLS_FIELD = 'front_image_url'  # item field that holds the image URLs to download
img_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'images')
print(img_path)
IMAGES_STORE = img_path
MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "jobbole"
MYSQL_USER = "root"
MYSQL_PASSWORD = "root"
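Before starting the crawl it is worth making sure the MYSQL_* values above actually connect; a quick standalone check with pymysql (which pipelines.py already uses) might look like this, assuming the jobbole database already exists:

import pymysql

conn = pymysql.connect(host="127.0.0.1", user="root", password="root",
                       database="jobbole", charset="utf8")
try:
    with conn.cursor() as cursor:
        cursor.execute("SELECT VERSION()")
        print(cursor.fetchone())
finally:
    conn.close()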
三、pipelines.py (data storage)
# -*- coding: utf-8 -*-
import json
import os
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from scrapy.pipelines.images import ImagesPipeline
from twisted.enterprise import adbapi
class AriclespiderPipeline(object):
def process_item(self, item, spider):
return item
class AricleImagePipeline(ImagesPipeline):
    # Record each downloaded image's local storage path on the item
def item_completed(self, results, item, info):
if 'front_image_url' in item:
for ok, value in results:
                # value holds the image url and its local storage path
image_file_path = value["path"]
item["front_image_path"] = image_file_path
return item
class AricleSaveJsonPipeline(object):
    # Save items to a local JSON file
def __init__(self):
path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'ceshi.json')
self.f = open(path, "a", encoding='utf-8')
def process_item(self, item, spider):
        item_json = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.f.write(item_json)
return item
class AricleSaveDBPipeline(object):
    # Save items to MySQL (synchronous, blocking insert)
def __init__(self):
        # Connect to the database
self.conn = pymysql.connect(
host="127.0.0.1",
user="root", password="root",
database="article_spider",
charset="utf8")
        # Get a cursor object for executing SQL statements
        self.cursor = self.conn.cursor()  # result sets are returned as tuples by default
def process_item(self, item, spider):
        sql = '''
        INSERT INTO jobbole_article
        (front_image_url, create_date, image_url_id, title, content, tags, content_id, comment_count, total_view, digg_count, bury_count)
        VALUES
        ("{}","{}","{}","{}",'{}',"{}",{},{},{},{},{}) ON DUPLICATE KEY UPDATE bury_count=VALUES(bury_count);
        '''
sql = sql.format(','.join(item.get('front_image_url')),
item.get('create_date'),
item.get('image_url_id'),
item.get('title'),
item.get('content'),
item.get('tags'),
item.get('content_id'),
item.get('comment_count'),
item.get('total_view'),
item.get('digg_count'),
item.get('bury_count'))
print(sql)
try:
self.cursor.execute(sql)
self.conn.commit()
except Exception as e:
print('=====error:{}'.format(e))
return item
class MysqlTwistedPipeline:
    # Execute SQL asynchronously through Twisted's adbapi connection pool
def __init__(self, dbpool):
self.dbpool = dbpool
    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handler_error, item, spider)
        return item
def do_insert(self, cursor, item):
sql = '''
INSERT INTO jobbole_article
(front_image_url, create_date,image_url_id,title,content,tags,content_id,comment_count,total_view,digg_count,bury_count,url_object_id)
VALUES
("{}","{}","{}","{}",'{}',"{}",{},{},{},{},{},'0') on DUPLICATE KEY UPDATE title=VALUES(title),tags=VALUES(tags),content=VALUES(content);
'''
sql = sql.format(','.join(item.get('front_image_url','')),
item.get('create_date',''),
item.get('image_url_id',''),
item.get('title',''),
# 'xxx',
item.get('content',''),
# '大象汽车',
item.get('tags',''),
item.get('content_id'),
item.get('comment_count'),
item.get('total_view'),
item.get('digg_count'),
item.get('bury_count'))
print(sql)
cursor.execute(sql)
def handler_error(self, failure, item, spider):
print(failure)
@classmethod
def from_settings(cls, settings):
from MySQLdb.cursors import DictCursor
dbparms = dict(
host=settings["MYSQL_HOST"],
db=settings["MYSQL_DBNAME"],
user=settings["MYSQL_USER"],
passwd=settings["MYSQL_PASSWORD"],
charset='utf8',
cursorclass=DictCursor,
use_unicode=True,
)
dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
return cls(dbpool)
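One caveat with both MySQL pipelines above: building the statement with str.format breaks as soon as a title or the article body contains quote characters, and it is open to SQL injection. A safer variant of do_insert (same columns, sketched here rather than taken from the project) passes the values as query parameters and lets the MySQLdb/pymysql driver do the escaping:

    def do_insert(self, cursor, item):
        # Parameterized version: the driver escapes the values itself
        insert_sql = '''
            INSERT INTO jobbole_article
            (front_image_url, create_date, image_url_id, title, content, tags,
             content_id, comment_count, total_view, digg_count, bury_count, url_object_id)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE title=VALUES(title), tags=VALUES(tags), content=VALUES(content)
        '''
        params = (
            ','.join(item.get('front_image_url', [])),
            item.get('create_date', ''),
            item.get('image_url_id', ''),
            item.get('title', ''),
            item.get('content', ''),
            item.get('tags', ''),
            item.get('content_id'),
            item.get('comment_count'),
            item.get('total_view'),
            item.get('digg_count'),
            item.get('bury_count'),
            '0',
        )
        cursor.execute(insert_sql, params)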
四、items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import re
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Identity, Join
class AriclespiderItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
class ArticleItemLoader(ItemLoader):
default_output_processor = TakeFirst()
def add_test(value):
return value + '++'
def add_jobbole(value):
return value + '--'
def date_convert(value):
    # Extract the date from strings like "发布于 <date>"; fall back if the pattern does not match
    match_re = re.match(r'发布于 (.*)', value)
    if match_re:
        return match_re.group(1)
    return '1970-01-01'
class JobbolespiderItem(scrapy.Item):
title = scrapy.Field(
# input_processor = MapCompose(add_jobbole,add_test),
# output_processor=TakeFirst()
)
# create_date_content = scrapy.Field()
create_date = scrapy.Field(
input_processor= MapCompose(date_convert),
output_processor=TakeFirst()
)
content = scrapy.Field()
tags = scrapy.Field(
output_processor =Join(separator=',')
)
url = scrapy.Field()
front_image_url = scrapy.Field(
output_processor=Identity()
)
image_url_id = scrapy.Field()
front_image_path = scrapy.Field()
content_id = scrapy.Field()
comment_count = scrapy.Field()
total_view = scrapy.Field()
digg_count = scrapy.Field()
bury_count = scrapy.Field()
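For reference, the create_date processors behave like this: MapCompose(date_convert) runs date_convert over every extracted value and TakeFirst keeps the first non-empty result, so a raw string in the site's "发布于 ..." format is reduced to just the date. A quick check (the sample string is made up; run it where items.py is importable):

from scrapy.loader.processors import MapCompose, TakeFirst
from items import date_convert  # same import style as jobbole.py

values = MapCompose(date_convert)(['发布于 2020-05-15 10:30'])
print(values)               # ['2020-05-15 10:30']
print(TakeFirst()(values))  # '2020-05-15 10:30'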