A Python Scrapy spider: crawling news content from a website

[Full source code] https://github.com/beng0305/ThirtySixSpider
[Environment] Python 2.7, Scrapy 1.4, PhantomJS, pyodbc, SQL Server 2008

The database layer uses pyodbc with SQL Server; pyodbc's encoding issues took a long time to resolve.

PhantomJS is used to fetch JavaScript-rendered dynamic content. It is quite slow, but on Windows it was the only practical choice.

The approaches discussed online vary widely. I tried scrapy-splash, which is said to be reasonably fast, but Splash runs in a Docker container, and installing Docker on Windows ran into one problem after another, so I eventually gave up on it.

The spider is a bit slow: an initial crawl of about 300 news articles takes a little over 20 minutes, but it runs fairly stably.

[Result]

[Source code]

Main spider class, ThirtySixSpider.py

# -*- coding: utf-8 -*-
# Author:   BinBin
# Email:    289594665@qq.com
# Time :    2017/07/27

import sys

import scrapy

from scrapy import Request
from ..items import ArticleItem

# Python 2 only: force the default string encoding to UTF-8 so the crawled
# Chinese text does not trip implicit ASCII conversions
reload(sys)
sys.setdefaultencoding("utf-8")

class ThirtySixSpider(scrapy.Spider):
    name = "ThirtySix"
    allowed_domains = ["36kr.com"]
    start_urls = ['http://36kr.com']

    def parse(self, response):
        print "url:" + response.url
        print "response:" + response.__str__()

        # 1. Collect absolute article links such as http://36kr.com/p/5055572.html
        newsFullLinks = response.xpath('//a[re:test(@href, ".+(/p/\d+\.html)$")]/@href').extract()
        for link in newsFullLinks:
            yield Request(link, callback=self.parse_item)

        # 2. Collect relative article links such as /p/5084179.html
        newsIncompleteLinks = response.xpath('//a[re:test(@href, "^(/p/\d+\.html)$")]/@href').extract()
        for link in newsIncompleteLinks:
            link = response.urljoin(link)
            #print link
            yield Request(link, callback=self.parse_item)

        # 3. Collect relative listing links such as /tags/***, /user/***, /topics/***
        otherIncompleteLinks = response.xpath('//a[re:test(@href, "(^/tags/|^/user/|^/topics/).*")]/@href').extract()
        for link in otherIncompleteLinks:
            link = response.urljoin(link)
            #print link
            yield Request(link, callback=self.parse_next)

        # 4. Collect absolute listing links such as http://36kr.com/tags/***, http://36kr.com/user/***, http://36kr.com/topics/***
        otherFullLinks = response.xpath('//a[re:test(@href, "(^.+/tags/|^.+/user/|^.+/topics/).*")]/@href').extract()
        for link in otherFullLinks:
            #print link
            yield Request(link, callback=self.parse_next)

    # Crawl the next page and collect more links
    def parse_next(self, response):
        # 1. Collect absolute article links such as http://36kr.com/p/5055572.html
        newsFullLinks = response.xpath('//a[re:test(@href, ".+(/p/\d+\.html)$")]/@href').extract()

        for link in newsFullLinks:
            yield Request(link, callback=self.parse_item)

        # 2. Collect relative article links such as /p/5084179.html
        newsIncompleteLinks = response.xpath('//a[re:test(@href, "^(/p/\d+\.html)$")]/@href').extract()
        for link in newsIncompleteLinks:
            link = response.urljoin(link)
            print link
            yield Request(link, callback=self.parse_item)

        # 3. Collect relative listing links such as /tags/***, /user/***, /topics/***
        otherIncompleteLinks = response.xpath('//a[re:test(@href, "(^/tags/|^/user/|^/topics/).*")]/@href').extract()
        for link in otherIncompleteLinks:
            link = response.urljoin(link)
            #print link
            yield Request(link, callback=self.parse_next)

        # 4. Collect absolute listing links such as http://36kr.com/tags/***, http://36kr.com/user/***, http://36kr.com/topics/***
        otherFullLinks = response.xpath('//a[re:test(@href, "(^.+/tags/|^.+/user/|^.+/topics/).*")]/@href').extract()
        for link in otherFullLinks:
            #print link
            yield Request(link, callback=self.parse_next)

    # Parse a news article page and build an ArticleItem
    def parse_item(self, response):

        print "parse_item url:" + response.url
        item = ArticleItem()

        article_titles = response.xpath('//div[re:test(@id, "J_post_wrapper_.*")]/div[1]/h1/text()').extract()
        if article_titles:
            print "article_title:" + article_titles[0]
            item["article_title"] = article_titles[0]

        article_authors = response.xpath('//div[re:test(@id, "J_post_wrapper_.*")]/div[1]/div[1]/div[contains(@class, "author-panel")]/div[contains(@class, "author")]/a/span/text()').extract()
        if article_authors:
            print "article_author:" + article_authors[0]
            item["article_author"] = article_authors[0]

        article_summarys = response.xpath('//div[re:test(@id, "J_post_wrapper_.*")]/div[1]/div[1]/section[@class="summary"]/text()').extract()
        print "article_summarys:" + article_summarys.__str__()
        if article_summarys:
            print "article_summary:" + article_summarys[0]
            item["article_summary"] = article_summarys[0]

        article_icons = response.xpath('//div[re:test(@id, "J_post_wrapper_.*")]/div[1]/div[1]/section[@class="headimg"]/img/@src').extract()
        print "article_icons:" + article_icons.__str__()
        if article_icons:
            print "article_icon:" + article_icons[0]
            item["article_icon"] = article_icons[0]

        article_contents = response.xpath('//div[re:test(@id, "J_post_wrapper_.*")]/div[1]/div[1]/div[2]/section').extract()
        print "article_contents:" + article_contents.__str__()
        if article_contents:
            print "article_content:" + article_contents[0]
            item["article_content"] = article_contents[0]

        item["article_url"] = response.url
        # Only yield items that actually got a title
        if item.get("article_title"):
            yield item
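
The spider is then started from the Scrapy project root with the standard crawl command, using the name defined above:

scrapy crawl ThirtySix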

Database wrapper, DBHelper.py

# -*- coding: utf-8 -*-
# Author:   BinBin
# Email:    289594665@qq.com
# Time :    2017/07/27

import pyodbc

class DBHelper(object):

    def __init__(self, serverIp, port, dbName, uid, pwd):
        conn_info = 'DRIVER={SQL Server};DATABASE=%s;SERVER=%s,%s;UID=%s;PWD=%s' % (dbName, serverIp, port, uid, pwd)
        self.connection = pyodbc.connect(conn_info, unicode_results=True)
        self.cursor = self.connection.cursor()

    def __del__(self):
        if self.cursor:
            print(self.cursor, '__del__ cursor closed')
            self.cursor.close()
            self.cursor = None
        if self.connection:
            self.connection.close()
            self.connection = None

    def destroy(self):
        if self.cursor:
            print(self.cursor, 'destroy cursor closed')
            self.cursor.close()
            self.cursor = None
        if self.connection:
            self.connection.close()
            self.connection = None

    # Fetch all rows of a query
    def queryAll(self, qryStr):
        print(qryStr.decode('gbk'))
        self.cursor.execute(qryStr)
        return self.cursor.fetchall()

    # Fetch the first maxCount rows of a query
    def querySome(self, qryStr, maxCount):
        self.cursor.execute(qryStr)
        return self.cursor.fetchmany(maxCount)

    # Fetch one page of results: skip skipCnt rows, then fetch pageSize rows
    def queryPage(self, qryStr, skipCnt, pageSize):
        self.cursor.execute(qryStr)
        self.cursor.skip(skipCnt)
        return self.cursor.fetchmany(pageSize)

    # Return a single scalar, e.g. the result of a SELECT COUNT(*) query
    def count(self, sql):
        self.cursor.execute(sql)
        return self.cursor.fetchone()[0]

    # Execute an insert/update/delete statement and return the number of affected rows
    def execute(self, sql):
        count = self.cursor.execute(sql).rowcount
        self.connection.commit()
        return count
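
A quick usage sketch (the connection details and password below are placeholders, not the real ones):

# Hypothetical connection details, for illustration only
helper = DBHelper('127.0.0.1', '1433', 'TestForBinBin', 'sa', 'your_password')
print helper.count('select count(*) from T_Article')
for row in helper.querySome('select article_title, article_url from T_Article', 10):
    print row.article_title, row.article_url
helper.destroy()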

News item definition, items.py

# -*- coding: utf-8 -*-
# Author:   BinBin
# Email:    289594665@qq.com
# Time :    2017/07/27

import scrapy
from scrapy import Field

class ArticleItem(scrapy.Item):
    article_title = Field()
    article_author = Field()
    article_src = Field()
    article_url = Field()
    article_type = Field()
    article_content = Field()
    article_summary = Field()
    article_icon = Field()
    article_time = Field()

Downloader middleware, middlewares.py
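
Since 36kr builds the article pages with JavaScript, requests are routed through PhantomJS. The actual middlewares.py in the repository may differ; the following is only a minimal sketch, assuming a selenium-driven PhantomJS downloader middleware with the phantomjs binary on the PATH:

# -*- coding: utf-8 -*-
# Minimal sketch of a PhantomJS downloader middleware (assumes selenium is
# installed and phantomjs is on the PATH; not necessarily the exact code
# shipped in the repository).

from selenium import webdriver
from scrapy.http import HtmlResponse

class PhantomJSMiddleware(object):

    def __init__(self):
        # One shared headless browser instance; slow, but it works on Windows.
        self.driver = webdriver.PhantomJS()

    def process_request(self, request, spider):
        # Let PhantomJS render the page so JS-generated content is present,
        # then hand the rendered HTML back to Scrapy as an HtmlResponse.
        self.driver.get(request.url)
        return HtmlResponse(url=request.url, body=self.driver.page_source,
                            encoding='utf-8', request=request)

The middleware would then be enabled through DOWNLOADER_MIDDLEWARES in settings.py.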


Item pipeline, pipelines.py

# -*- coding: utf-8 -*-
# Author:   BinBin
# Email:    289594665@qq.com
# Time :    2017/07/27

import string
from DBHelper import DBHelper

class ThirtySixPipeline(object):

    def __init__(self):
        self.helper = DBHelper('120.*.215.*', '1433', 'TestForBinBin', 'sa', '******')

    def process_item(self, item, spider):
        print "process_item title" +  item["article_title"]

        # SQL statement that inserts the article into the database
        sql = u'insert into T_Article(article_title, article_author, article_url, article_content, article_summary, article_icon) values (\'{t}\',\'{a}\',\'{u}\',\'{c}\',\'{s}\',\'{i}\')'\
            .format(
                t = item["article_title"],
                a = item["article_author"],
                u = item["article_url"],
                c = item["article_content"],
                s = item["article_summary"],
                i = item["article_icon"]
            )
        # \xa0 (non-breaking space) cannot be encoded in GBK, so replace it with a plain space
        sql = sql.replace(u'\xa0', u' ')
        row = self.helper.execute(sql.encode('GBK', 'ignore'))
        return item
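
For the pipeline to run, it has to be registered in the project's settings.py. Assuming the project package is named ThirtySixSpider (an assumption based on the repository name), the entry looks roughly like this:

# settings.py -- the package path is an assumption based on the repo name
ITEM_PIPELINES = {
    'ThirtySixSpider.pipelines.ThirtySixPipeline': 300,
}

Note that building the INSERT statement with str.format is fragile: a single quote in a title or article body will break the SQL. Passing the values as parameters to cursor.execute with ? placeholders would avoid the quoting problem and most of the GBK encoding trouble.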


