Crawling Jobbole (伯乐在线) with Python: the complete Scrapy project

1. jobbole.py

import scrapy
import re
import datetime
from urllib import parse
from scrapy.http import Request

from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader
from ArticleSpider.utils.common import get_md5


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts']

    def parse(self, response):
        """
        1. Extract the article URLs from the list page and hand them to Scrapy to download and parse.
        2. Extract the URL of the next page and hand it to Scrapy; once downloaded it is fed back into parse.
        """
        # parse every article URL on the list page and hand it to Scrapy for downloading and parsing
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for post_node in post_nodes:
            # URL of the cover image
            image_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            # once the request has been downloaded, parse_detail is called back to parse the article detail page
            # Request(url=post_url, callback=self.parse_detail)
            # urljoin handles hrefs that come without a domain: it joins them with response.url
            yield Request(url=parse.urljoin(response.url, post_url),
                          meta={"front_image_url": image_url},
                          callback=self.parse_detail)

        # extract the next page and hand it to Scrapy for downloading
        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        # The manual extraction below was replaced by the ItemLoader version further down.
        # front_image_url = response.meta.get("front_image_url", "")  # article cover image
        # title = response.css(".entry-header h1::text").extract_first()
        # create_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace("·", "").strip()
        # praise_nums = response.css(".vote-post-up h10::text").extract()[0]
        # fav_nums = response.css(".bookmark-btn::text").extract()[0]
        # match_re = re.match(".*?(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = int(match_re.group(1))
        # else:
        #     fav_nums = 0
        #
        # comment_nums = response.css("a[href='#article-comment'] span::text").extract()[0]
        # match_re = re.match(".*?(\d+).*", comment_nums)
        # if match_re:
        #     comment_nums = int(match_re.group(1))
        # else:
        #     comment_nums = 0
        #
        # # content = response.css("div.entry::text").extract()
        # content = response.css('div.entry').extract_first()
        #
        # tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)
        # try:
        #     create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
        # except Exception as e:
        #     create_date = datetime.datetime.now().date()
        #
        # article_item = JobBoleArticleItem()
        # article_item["title"] = title
        # article_item["url"] = response.url
        # article_item["create_date"] = create_date
        # article_item["front_image_url"] = [front_image_url]
        # article_item["praise_nums"] = praise_nums
        # article_item["comment_nums"] = comment_nums
        # article_item["fav_nums"] = fav_nums
        # article_item["tags"] = tags
        # article_item["content"] = content
        # article_item["url_object_id"] = get_md5(response.url)

        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        # use the custom ArticleItemLoader from items.py so TakeFirst() is applied to every field by default
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        # load_item() applies the configured rules/processors and builds the item object
        article_item = item_loader.load_item()
        yield article_item
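The spider imports get_md5 from ArticleSpider.utils.common, a helper the post does not show. A minimal sketch, assuming it simply returns the hex MD5 digest of the URL so it can serve as a fixed-length key:

# ArticleSpider/utils/common.py  (assumed implementation, not shown in the post)
import hashlib

def get_md5(url):
    # hashlib works on bytes, so encode str input first
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()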

2. main.py

from scrapy.cmdline import execute

import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "jobbole"])
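This script lets you start (and breakpoint-debug) the crawler from an IDE instead of the command line: appending the script's own directory to sys.path ensures the ArticleSpider package can be imported. It is typically placed in the project root, next to scrapy.cfg.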

3. items.py

import scrapy
import datetime
import re

from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join


class ArticlespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


def date_convert(value):
    # parse the publish date; fall back to today if the format is unexpected
    try:
        create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date()
    except Exception as e:
        create_date = datetime.datetime.now().date()
    return create_date


def get_nums(value):
    # pull the first number out of strings like "2 收藏" or "1 评论"
    match_re = re.match(r".*?(\d+).*", value)
    if match_re:
        nums = int(match_re.group(1))
    else:
        nums = 0
    return nums


def remove_comment_tags(value):
    # drop the "评论" (comments) entry that gets extracted together with the tags
    if "评论" in value:
        return ""
    else:
        return value


def return_value(value):
    # pass the value through unchanged; used to override the default TakeFirst()
    # so front_image_url stays a list, as the images pipeline expects
    return value


# class JobBoleArticleItem(scrapy.Item):
#     title = scrapy.Field()
#     create_date = scrapy.Field()
#     url = scrapy.Field()
#     url_object_id = scrapy.Field()
#     front_image_url = scrapy.Field()
#     front_image_path = scrapy.Field()
#     praise_nums = scrapy.Field()
#     comment_nums = scrapy.Field()
#     fav_nums = scrapy.Field()
#     content = scrapy.Field()
#     tags = scrapy.Field()


class ArticleItemLoader(ItemLoader):
    # custom ItemLoader: take the first extracted value for every field by default
    default_output_processor = TakeFirst()


class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert),
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    # the tag value is itself a list, so override the output processor to join it
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()
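Note that the asynchronous MySQL pipeline in pipelines.py calls item.get_insert_sql(), which the JobBoleArticleItem shown above does not define. A minimal sketch of such a method, added inside JobBoleArticleItem and assuming the same jobbole_article table as the synchronous pipeline (the exact SQL is an assumption):

    # assumed method on JobBoleArticleItem, used by MysqlTwistedPipline.do_insert()
    def get_insert_sql(self):
        insert_sql = """
            insert into jobbole_article(title, create_date, url, url_object_id, front_image_url,
                praise_nums, comment_nums, fav_nums, tags, content)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        # front_image_url is kept as a list for the images pipeline, so store its first element
        params = (self["title"], self["create_date"], self["url"], self["url_object_id"],
                  self.get("front_image_url", [""])[0], self["praise_nums"], self["comment_nums"],
                  self["fav_nums"], self["tags"], self["content"])
        return insert_sql, params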

4. pipelines.py

import codecs
import json

import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exporters import JsonItemExporter


class ArticlespiderPipeline(object):
    def process_item(self, item, spider):
        return item


class ArticleImagePipeline(ImagesPipeline):
    # override item_completed to read the actual local path of the downloaded image from `results`
    def item_completed(self, results, item, info):
        for ok, value in results:
            image_file_path = value["path"]
            item["front_image_path"] = image_file_path
        return item


class MysqlTwistedPipline(object):
    # insert into MySQL asynchronously through twisted's adbapi connection pool
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        # **dbparms expands to adbapi.ConnectionPool("MySQLdb", host=settings['MYSQL_HOST'], ...)
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # use twisted to turn the MySQL insert into an asynchronous operation
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle exceptions
        return item

    def handle_error(self, failure, item, spider):
        # handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # run the actual insert
        # build a different SQL statement depending on the item and insert it into MySQL
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)


class JsonWithEncodingPipeline(object):
    # custom export to a JSON file
    def __init__(self):
        # open with codecs to avoid encoding problems
        self.file = codecs.open('article.json', 'w', encoding="utf-8")

    def process_item(self, item, spider):
        # convert the item to a dict and dump it as JSON; ensure_ascii=False keeps Chinese text intact
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def close_spider(self, spider):
        # called automatically when the spider closes; release the file handle
        self.file.close()


class JsonExporterPipeline(object):
    # export a JSON file with the JsonItemExporter that Scrapy provides
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


class MysqlPipeline(object):
    # write to MySQL synchronously
    def __init__(self):
        self.conn = MySQLdb.connect('localhost', 'root', '123456', 'article_spider', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into jobbole_article(title, create_date, url, url_object_id, front_image_url,
                praise_nums, comment_nums, fav_nums, tags, content)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql, (item["title"], item["create_date"], item["url"], item["url_object_id"],
                                         item["front_image_url"], item["praise_nums"], item["comment_nums"],
                                         item["fav_nums"], item["tags"], item["content"]))
        self.conn.commit()
        return item
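Both MySQL pipelines assume the article_spider database already contains a jobbole_article table; the post does not show the schema. A possible one-off creation script, with column names taken from the insert statement and types/lengths being assumptions:

# create_table.py -- one-off helper; column types and lengths are assumptions, adjust to your data
import MySQLdb

DDL = """
CREATE TABLE IF NOT EXISTS jobbole_article (
    url_object_id    varchar(50)  NOT NULL PRIMARY KEY,   -- md5 of the article url
    title            varchar(200) NOT NULL,
    create_date      date         DEFAULT NULL,
    url              varchar(300) NOT NULL,
    front_image_url  varchar(300) DEFAULT NULL,
    front_image_path varchar(200) DEFAULT NULL,
    praise_nums      int(11)      NOT NULL DEFAULT 0,
    comment_nums     int(11)      NOT NULL DEFAULT 0,
    fav_nums         int(11)      NOT NULL DEFAULT 0,
    tags             varchar(200) DEFAULT NULL,
    content          longtext
) DEFAULT CHARSET=utf8
"""

conn = MySQLdb.connect("localhost", "root", "123456", "article_spider", charset="utf8")
cursor = conn.cursor()
cursor.execute(DDL)
conn.commit()
conn.close()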

5. settings.py

import os

BOT_NAME = 'ArticleSpider'

SPIDER_MODULES = ['ArticleSpider.spiders']
NEWSPIDER_MODULE = 'ArticleSpider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ArticleSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'ArticleSpider.middlewares.ArticlespiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'ArticleSpider.middlewares.ArticlespiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    # 'ArticleSpider.pipelines.ArticleImagePipeline': 1,
    'ArticleSpider.pipelines.JsonExporterPipeline': 2,
    # 'ArticleSpider.pipelines.MysqlPipeline': 4,
}

# which item field holds the image URLs for the images pipeline, and where to store the downloads
IMAGES_URLS_FIELD = "front_image_url"
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MySQL connection settings used by the MySQL pipelines
MYSQL_HOST = "localhost"
MYSQL_DBNAME = "article_spider"
MYSQL_USER = "root"
MYSQL_PASSWORD = "123456"
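One thing to notice: ArticleImagePipeline and MysqlTwistedPipline are defined in pipelines.py but not enabled in ITEM_PIPELINES above, so only the JSON exporter actually runs. To also download cover images and write to MySQL asynchronously, the dict could be changed along these lines (the numbers are only relative priorities, lower runs first); the images pipeline additionally requires Pillow to be installed:

ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,   # download covers and fill front_image_path
    'ArticleSpider.pipelines.JsonExporterPipeline': 2,
    'ArticleSpider.pipelines.MysqlTwistedPipline': 10,   # asynchronous MySQL insert
}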
