Scraping the Jobbole (伯乐在线) site with Python

Crawling blog posts from blog.jobbole.com with Python 3.5 + Scrapy

Create a virtual environment

mkvirtualenv -p C:\Users\Joseph\AppData\Local\Programs\Python\Python35\python.exe article_spider
pip install Twisted-17.5.0-cp35-cp35m-win_amd64.whl
pip install -i https://pypi.douban.com/simple/ scrapy

Create the Scrapy project from the command line

scrapy startproject ArticleSpider

Generate a spider from the basic template

cd ArticleSpider
scrapy genspider jobbole blog.jobbole.com

Run the spider

scrapy crawl jobbole

Errors on Windows

Error: the win32 module is missing.

Fix: pip install -i https://pypi.douban.com/simple pypiwin32

To make breakpoint debugging easier, create a main.py in the project root

from scrapy.cmdline import execute

import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "jobbole"])

Note: ROBOTSTXT_OBEY must be set to False, otherwise Scrapy filters out URLs that robots.txt disallows.

In settings.py:

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

To avoid re-downloading the page on every run, debug the extraction in the Scrapy shell

scrapy shell http://blog.jobbole.com/110287/

Extracting the fields with XPath

import re

title = response.xpath('//div[@class="entry-header"]/h1/text()').extract_first()
create_date = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract_first().replace("·", "").strip()
praise_nums = response.xpath("//span[contains(@class, 'vote-post-up')]/h10/text()").extract_first()

fav_nums = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract_first()
match_fav_nums = re.match(r".*?(\d+).*?", fav_nums)
if match_fav_nums:
    fav_nums = match_fav_nums.group(1)

comment_nums = response.xpath("//a[@href='#article-comment']/span/text()").extract_first()
match_comment_nums = re.match(r".*?(\d+).*?", comment_nums)
if match_comment_nums:
    comment_nums = match_comment_nums.group(1)

content = response.xpath("//div[@class='entry']").extract_first()

tag_list = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
tags = ",".join(tag_list)

Extracting the fields with CSS selectors

title = response.css(".entry-header h1::text").extract_first()
create_date = response.css("p.entry-meta-hide-on-mobile::text").extract_first().replace("·", "").strip()
praise_nums = response.css(".vote-post-up h10::text").extract_first()

fav_nums = response.css(".bookmark-btn::text").extract_first()
match_fav_nums = re.match(r".*?(\d+).*?", fav_nums)
if match_fav_nums:
    fav_nums = match_fav_nums.group(1)

comment_nums = response.css("a[href='#article-comment'] span::text").extract_first()
match_comment_nums = re.match(r".*?(\d+).*?", comment_nums)
if match_comment_nums:
    comment_nums = match_comment_nums.group(1)

content = response.css("div.entry").extract_first()

tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
tags = ",".join(tag_list)

Crawl every listing page and extract the required data

# -*- coding: utf-8 -*-
import re

import scrapy
from scrapy.http import Request
from urllib import parse


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        """
        1. Extract the article URLs from the listing page and hand them to Scrapy to download and parse.
        2. Extract the URL of the next listing page and hand it to Scrapy; once downloaded it comes back to parse().
        """
        # Extract every article URL on the listing page and schedule it for parse_detail
        post_urls = response.css("#archive .floated-thumb .post-thumb a::attr(href)").extract()
        for post_url in post_urls:
            yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail)

        # Extract the next page and schedule it for download
        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        # Extract the individual article fields
        title = response.xpath('//div[@class="entry-header"]/h1/text()').extract_first("")
        create_date = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract_first("").replace("·", "").strip()
        praise_nums = response.xpath("//span[contains(@class, 'vote-post-up')]/h10/text()").extract_first("")

        fav_nums = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract_first("")
        match_fav_nums = re.match(r".*?(\d+).*?", fav_nums)
        if match_fav_nums:
            fav_nums = int(match_fav_nums.group(1))
        else:
            fav_nums = 0

        comment_nums = response.xpath("//a[@href='#article-comment']/span/text()").extract_first("")
        match_comment_nums = re.match(r".*?(\d+).*?", comment_nums)
        if match_comment_nums:
            comment_nums = int(match_comment_nums.group(1))
        else:
            comment_nums = 0

        content = response.xpath("//div[@class='entry']").extract_first("")

        tag_list = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
        tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        tags = ",".join(tag_list)

Persisting the data

Define the item model in items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import datetime
import re

import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst, Join
from scrapy.loader import ItemLoader


class ArticlespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


def date_convert(value):
    try:
        create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date()
    except Exception as e:
        create_date = datetime.datetime.now().date()
    return create_date


def get_nums(value):
    match_nums = re.match(r".*?(\d+).*?", value)
    if match_nums:
        nums = int(match_nums.group(1))
    else:
        nums = 0
    return nums


def remove_comment_tags(value):
    # Drop the "N 评论" entry that gets extracted along with the tags
    if "评论" in value:
        return ""
    else:
        return value


def return_value(value):
    return value


class ArticleItemLoader(ItemLoader):
    # Custom ItemLoader: take the first extracted value of every field by default
    default_output_processor = TakeFirst()


class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert)
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        output_processor=MapCompose(return_value)  # override default_output_processor so the value stays a list
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    content = scrapy.Field()
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
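
The input processors run on each extracted value before the output processor is applied, so the cleanup regexes from the shell session above now live in items.py. A quick illustration of what the helpers return (the sample inputs are made up to mimic the page text; import path is the project's items module):

# e.g. from ArticleSpider.items import get_nums, date_convert, remove_comment_tags
print(get_nums(" 2 收藏"))             # -> 2
print(get_nums("收藏"))                # -> 0 (no digits found)
print(date_convert("2017/06/05"))      # -> 2017-06-05
print(remove_comment_tags(" 1 评论"))  # -> '' (comment counts are dropped from the tag list)
print(remove_comment_tags("职场"))     # -> '职场'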

Update the logic in jobbole.py to populate the item through the ItemLoader

import scrapy
from scrapy.http import Request
from urllib import parse

from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader
from ArticleSpider.utils.common import get_md5  # assumed location of the get_md5 helper shown below


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        """
        1. Extract the article URLs from the listing page and hand them to Scrapy to download and parse.
        2. Extract the URL of the next listing page and hand it to Scrapy; once downloaded it comes back to parse().
        """
        # Extract every article node on the listing page and schedule it for parse_detail
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for post_node in post_nodes:
            image_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url),
                          meta={"front_image_url": image_url},
                          callback=self.parse_detail)

        # Extract the next page and schedule it for download
        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        # Cover image URL passed along from the listing page
        front_image_url = response.meta.get("front_image_url", "")

        # Load the item through the ItemLoader
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_value("url", response.url)
        item_loader.add_xpath("create_date", '//p[@class="entry-meta-hide-on-mobile"]/text()')
        item_loader.add_xpath("title", '//div[@class="entry-header"]/h1/text()')
        item_loader.add_xpath("praise_nums", "//span[contains(@class, 'vote-post-up')]/h10/text()")
        item_loader.add_xpath("fav_nums", "//span[contains(@class, 'bookmark-btn')]/text()")
        item_loader.add_xpath("comment_nums", "//a[@href='#article-comment']/span/text()")
        item_loader.add_xpath("content", "//div[@class='entry']")
        item_loader.add_xpath("tags", '//p[@class="entry-meta-hide-on-mobile"]/a/text()')

        article_item = item_loader.load_item()
        yield article_item

Edit settings.py to enable the pipelines and turn on automatic image downloading

import os

ITEM_PIPELINES = {
    'ArticleSpider.pipelines.JsonWithEncodingPipeline': 2,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,
}

IMAGES_URLS_FIELD = "front_image_url"
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, "images")
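
Note: the image pipeline (and therefore the ArticleImagePipeline subclass defined below) needs Pillow to process the downloaded images; if it is not already in the virtualenv, install it the same way as the other packages:

pip install -i https://pypi.douban.com/simple pillow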

Add the commonly used helper get_md5(), which generates url_object_id

# -*- coding: utf-8 -*-
import hashlib


def get_md5(url):
    # hashlib.md5() does not accept a unicode str, so encode it to bytes first
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()  # return the md5 digest as a hex string
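
A quick sanity check of the helper (the URL here is just an example):

if __name__ == "__main__":
    print(get_md5("http://blog.jobbole.com/110287/"))  # prints a 32-character hex digest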

Edit pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import codecs
import json

import MySQLdb
import MySQLdb.cursors
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exporters import JsonItemExporter
from twisted.enterprise import adbapi


class ArticlespiderPipeline(object):
    def process_item(self, item, spider):
        return item


class JsonExporterPipeline(object):
    # Export items to a JSON file using the JsonItemExporter that Scrapy provides
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        # adbapi turns the database operations into asynchronous calls
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparams)
        return cls(dbpool)  # instantiate the pipeline with the connection pool

    def process_item(self, item, spider):
        # Use twisted to make the MySQL insert asynchronous
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)  # handle exceptions
        return item

    def handle_error(self, failure):
        # Handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # Run the actual insert
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            values (%s, %s, %s, %s)
        """
        cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"]))


class MysqlPipeline(object):
    # Write to MySQL synchronously
    def __init__(self):
        self.conn = MySQLdb.connect('localhost', 'root', '', 'article_spider', charset='utf8', use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            values (%s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"]))
        self.conn.commit()
        return item


class JsonWithEncodingPipeline(object):
    # Custom JSON file export
    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding="utf-8")

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def close_spider(self, spider):
        # close_spider is called automatically by Scrapy when the spider finishes
        self.file.close()


class ArticleImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        if "front_image_url" in item:
            for ok, value in results:
                image_file_path = value['path']
            # store the local file path in front_image_path, keeping the original URL in front_image_url
            item["front_image_path"] = image_file_path
        return item
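
MysqlTwistedPipeline pulls its connection parameters from settings.py through from_settings(); a minimal sketch of the required entries (values are placeholders, matching the synchronous MysqlPipeline above):

MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "article_spider"
MYSQL_USER = "root"
MYSQL_PASSWORD = ""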

The columns of the jobbole_article table correspond to the fields defined in items.py.
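
A minimal sketch of creating that table, covering only the columns the insert statements above actually write; the column types and lengths are assumptions:

import MySQLdb

# Assumed schema; extend it with the remaining item fields (url_object_id, tags, content, ...) as needed.
create_sql = """
    create table if not exists jobbole_article (
        title varchar(200) not null,
        url varchar(300) not null,
        create_date date,
        fav_nums int(11) default 0
    ) charset=utf8
"""

conn = MySQLdb.connect('localhost', 'root', '', 'article_spider', charset='utf8', use_unicode=True)
cursor = conn.cursor()
cursor.execute(create_sql)
conn.commit()
conn.close()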
