Scrapy crawler (3): the code before connecting to the database

1、settings.py

# -*- coding: utf-8 -*-

import os

BOT_NAME = 'ArticleSpider'

SPIDER_MODULES = ['ArticleSpider.spiders']
NEWSPIDER_MODULE = 'ArticleSpider.spiders'

ROBOTSTXT_OBEY = False  # if set to True, URLs disallowed by robots.txt are filtered out

ITEM_PIPELINES = {  # enable the item pipelines
    'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,  # built-in image download; the smaller the number, the earlier the pipeline runs
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,
}

IMAGES_URLS_FIELD = "front_image_url"  # item field holding the list of image URLs to download
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')  # downloaded images are saved under <project>/images
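
Scrapy's ImagesPipeline depends on Pillow for image processing, so it must be installed in the crawler's environment. If tiny thumbnails should be skipped as well, Scrapy provides two optional settings for that; the values below are only illustrative:

IMAGES_MIN_HEIGHT = 100  # optional: ignore images shorter than 100 px
IMAGES_MIN_WIDTH = 100   # optional: ignore images narrower than 100 px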

2、items.py

# -*- coding: utf-8 -*-

import scrapy

class ArticlespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_data = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field()
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field()
    favor_nums = scrapy.Field()
    comment_nums = scrapy.Field()
    tags = scrapy.Field()
    content = scrapy.Field()
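
A scrapy.Item behaves much like a dict, except that only declared fields can be assigned; anything else raises a KeyError. A quick sanity check (a minimal sketch, assuming the project package is importable):

from ArticleSpider.items import JobBoleArticleItem

item = JobBoleArticleItem()
item["title"] = "test title"
print(item["title"])   # -> test title
# item["unknown"] = 1  # would raise KeyError because 'unknown' is not a declared field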

3、pipelines.py

# -*- coding: utf-8 -*-

from scrapy.pipelines.images import ImagesPipeline

class ArticlespiderPipeline(object):
    def process_item(self, item, spider):
        return item

class ArticleImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        image_file_path = ""  # default, in case no image was downloaded
        for ok, value in results:  # save the local path of the downloaded cover image
            if ok:
                image_file_path = value["path"]
        item["front_image_path"] = image_file_path

        return item
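
For reference, the results argument that ImagesPipeline passes to item_completed is a list of (success, info) tuples; when success is True, info is a dict containing 'url', 'path' and 'checksum', which is why value["path"] above yields the path relative to IMAGES_STORE. Roughly (values are illustrative, not real output):

# results ~ [(True, {'url': 'http://.../cover.jpg',
#                    'path': 'full/0a1b2c3d....jpg',
#                    'checksum': '...'})]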

4、jobbole.py

# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy.http import Request
import urlparse
#from urllib import parse  # py3
from ArticleSpider.items import JobBoleArticleItem
from ArticleSpider.utils.common import get_md5

import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 only: avoids UnicodeDecodeError: 'ascii' codec can't decode byte 0xe8 in position 0

class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    # start_urls = ['http://blog.jobbole.com/112558/']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        """
        1.获取文章列表页中的文章url并交给scrapy下载后并进行解析
        2.获取下一页的url并交给scrapy进行下载,下载完成后交给parse
        """

        # parse all article URLs on the listing page and hand them to Scrapy to download and parse:
        #post_urls = response.css("#archive .floated-thumb .post-thumb a::attr(href)").extract()  # grab the href attribute
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        #for post_url in post_urls:
        for post_node in post_nodes:
            image_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            yield Request(url=urlparse.urljoin(response.url, post_url), meta={"front_image_url":image_url}, callback=self.parse_detail)
            #yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail)  # py3
            print(post_url)

        # extract the next-page URL and hand it to Scrapy to download
        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_url:
            yield Request(url=urlparse.urljoin(response.url, next_url), callback=self.parse)  # follow pagination with next_url

    def parse_detail(self, response):
        article_item = JobBoleArticleItem()

        # extract the individual fields of the article
        # css:
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        title2 = response.css(".entry-header h1::text").extract()[0]
        create_data2 = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace('·', '')
        praise_nums2 = int(response.css(".href-style h10::text").extract()[0])
        favor_nums2 = response.css(".bookmark-btn::text").extract()[0]
        match_re3 = re.match(".*?(\d+).*", favor_nums2)
        if match_re3:
            favor_nums2 = int(match_re3.group(1))
        else:
            favor_nums2 = 0
        comment_nums2 = response.css("span.hide-on-480::text").extract()[0]
        match_re4 = re.match(".*?(\d+).*", comment_nums2)
        if match_re4:
            comment_nums2 = int(match_re4.group(1))
        else:
            comment_nums2 = 0
        content2 = response.css('div.entry').extract()[0]
        tag_list2 = response.css("p.entry-meta-hide-on-mobile a::text").extract()
        tag_list2 = [element for element in tag_list2 if not element.strip().endswith("评论")]  # drop the trailing comment-count entry ("N 评论") from the tag list
        tags2 = ",".join(tag_list2)

        article_item["url_object_id"] = get_md5(response.url)
        article_item["title"] = title2
        article_item["url"] = response.url
        article_item["create_data"] = create_data2
        article_item["front_image_url"] = [front_image_url]  # 数组格式
        article_item["praise_nums"] = praise_nums2
        article_item["favor_nums"] = favor_nums2
        article_item["comment_nums"] = comment_nums2
        article_item["tags"] = tags2
        article_item["content"] = content2

        yield article_item  # hand the item over to the pipelines
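
The two re.match blocks in parse_detail repeat the same pattern. A small helper keeps them in one place (a sketch; the name extract_num is not part of the original code and could live in utils/common.py):

import re

def extract_num(text):
    # return the first integer found in strings like " 2 收藏" or " 3 评论", or 0 if none
    match_re = re.match(".*?(\d+).*", text)
    return int(match_re.group(1)) if match_re else 0

# usage in parse_detail:
# favor_nums2 = extract_num(response.css(".bookmark-btn::text").extract()[0])
# comment_nums2 = extract_num(response.css("span.hide-on-480::text").extract()[0])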

5、New file: ArticleSpider/utils/common.py

import hashlib

def get_md5(url):
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()

if __name__ == "__main__":
    print(get_md5("http://jobbole.com"))
    #print(get_md5("http://jobbole.com".encode("utf-8")))  # py3
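
The isinstance(url, str) branch is what Python 3 needs, because hashlib.md5 only accepts bytes; under Python 2 a plain str is already bytes, and it is unicode values that would need encoding. A version covering both interpreters might look like this (a sketch, not part of the original tutorial):

import hashlib
import sys

def get_md5(url):
    if sys.version_info[0] >= 3:
        if isinstance(url, str):       # Python 3: str -> bytes
            url = url.encode("utf-8")
    elif isinstance(url, unicode):     # Python 2: unicode -> bytes
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()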

6、New file: main.py

# coding:utf-8

from scrapy.cmdline import execute
import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))  # make the project directory importable
#print(os.path.abspath(__file__))  # this file's own path
#print(os.path.dirname(os.path.abspath(__file__)))  # its parent directory

execute(["scrapy", "crawl", "jobbole"])