使用scrapy爬虫框架爬取北京大学汇丰商学院的新闻,新闻网址:http://www.phbs.pku.edu.cn/list-419-1.html
1.定义items.py容器类文件,代码如下:
import scrapy
class PhbsNewsItem(scrapy.Item):
    """Container for one scraped PHBS news article."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()    # article headline (utf-8 encoded by the spider)
    date = scrapy.Field()     # publish time string, "YYYY-mm-dd HH:MM:SS"
    origin = scrapy.Field()   # source/author shown on the page
    clicks = scrapy.Field()   # total hit count fetched from the JS counter endpoint
    content = scrapy.Field()  # article body (BeautifulSoup tag of the content div)
其中,spider中获取点击数的几行代码,通过请求页面内嵌js计数器的链接,模拟js的点击统计,获取新闻的点击数;
该链接返回如下内容:“$('#todaydowns').html('4');$('#weekdowns').html('6');$('#monthdowns').html('243');$('#hits').html('271');”
使用split('\'')[-2]按单引号拆分,并取倒数第二个字段,即总点击数(此例中为271)
import scrapy
import re
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from PHBS_News.items import PhbsNewsItem
from bs4 import BeautifulSoup
import urllib2
import sys
# Python 2 hack: reload(sys) re-exposes setdefaultencoding (removed from the
# module namespace by site.py) so implicit str/unicode conversions of the
# scraped Chinese text default to utf-8 instead of raising UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf8')
class PhbsNewsSpider(CrawlSpider):
    """Crawl news from the PHBS (Peking University HSBC Business School) site.

    Follows the paginated list pages and hands each article page to
    parse_torrent, which extracts title/date/origin/clicks/content into a
    PhbsNewsItem.
    """
    name = 'PhbsNews'
    allowed_domains = ['www.phbs.pku.edu.cn']
    start_urls = ['http://www.phbs.pku.edu.cn/list-419-1.html']
    # Rule 1 follows list pages (no callback, just link discovery);
    # Rule 2 routes article pages ("content-419...") to parse_torrent.
    rules = [Rule(SgmlLinkExtractor(allow=['http://www.phbs.pku.edu.cn/list-419-\d*.html'])),
             Rule(SgmlLinkExtractor(allow=['http://www.phbs.pku.edu.cn/content-419']), 'parse_torrent')]

    def parse_torrent(self, response):
        """Parse one article page into a PhbsNewsItem.

        :param response: the scrapy Response for an article page
        :return: a populated PhbsNewsItem
        """
        soup = BeautifulSoup(response.body)
        title = soup.find(attrs={'class': 'title'}).string
        date = soup.find(attrs={'class': 'inputtime'}).string
        origin = soup.find(attrs={'class': 'username'}).string
        # The click counter is filled in client-side by a <script src=...>
        # tag; request that JS endpoint directly to read the counts.
        clicks_url = soup.find(attrs={'language': 'JavaScript'})['src']
        # BUGFIX: the original rebound `response` (shadowing the scrapy
        # response parameter) and never closed the urllib2 connection.
        js_response = urllib2.urlopen(clicks_url)
        try:
            # Payload looks like:
            #   $('#todaydowns').html('4');...;$('#hits').html('271');
            # Splitting on single quotes, the second-to-last token is the
            # total hits value.
            clicks = js_response.read().split('\'')[-2]
        finally:
            js_response.close()
        content = soup.find(attrs={'class': 'content'})
        # The database was created with utf-8 encoding; encode to match.
        items = PhbsNewsItem()
        items['title'] = title.encode('utf-8')
        items['date'] = date
        items['origin'] = origin
        items['clicks'] = clicks
        items['content'] = content
        return items
import MySQLdb
import MySQLdb.cursors
from scrapy import log
import time
import sys
# Python 2 hack (same as in the spider module): restore setdefaultencoding
# and force utf-8 so item text round-trips to the utf-8 MySQL database
# without UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf8')
class PhbsNewsPipeline(object):
    """Persist scraped PHBS news items into the `opensns` MySQL database.

    Only articles newer than the latest category-3 row already stored are
    inserted (news row + news_detail row in one transaction).
    """

    # __init__ runs once per crawl: open one connection and remember the
    # newest stored article timestamp so re-crawls skip known articles.
    def __init__(self):
        self.db = MySQLdb.connect("localhost", "root", "", "opensns", charset='utf8')
        self.cursor = self.db.cursor()
        sql_check_date = ("select create_time from news "
                          "where category=3 order by create_time desc limit 1")
        self.cursor.execute(sql_check_date)
        result = self.cursor.fetchall()
        # Empty table -> 0 so every scraped article counts as "new".
        self.latest_date = result[0][0] if result else 0

    # Called by scrapy for every yielded item.
    def process_item(self, item, spider):
        """Insert `item` into news/news_detail if newer than latest_date.

        :return: the item, unchanged, so later pipelines still see it
        """
        # Convert "YYYY-mm-dd HH:MM:SS" to an epoch-seconds timestamp for
        # comparison/storage.
        time_array = time.strptime(item['date'], "%Y-%m-%d %H:%M:%S")
        time_stamp = int(time.mktime(time_array))
        if time_stamp > self.latest_date:
            # BUGFIX: use driver-parameterized queries instead of "%"
            # string interpolation -- titles/content come from scraped web
            # pages, so quotes in them would break (or inject into) the SQL.
            sql = ("INSERT INTO news(uid, title, category, status, view, dead_line, create_time) "
                   "VALUES (%s, %s, %s, %s, %s, %s, %s)")
            try:
                self.cursor.execute(
                    sql, ('1', item['title'], '3', '1', item['clicks'], '2147483640', time_stamp))
                # Detail row references the auto-increment id of the news row.
                sql_detail = "INSERT INTO news_detail(news_id, content) VALUES (%s, %s)"
                self.cursor.execute(sql_detail, (int(self.cursor.lastrowid), item['content']))
                # Commit both inserts together.
                self.db.commit()
            except MySQLdb.Error as e:
                # BUGFIX: the original bare `except:` silently swallowed every
                # failure; roll back and log so broken inserts are visible.
                self.db.rollback()
                self.handle_error(e)
        return item

    def handle_error(self, e):
        """Log a database error via scrapy's log facility."""
        log.err(e)
4.最后,不要忘记在settings.py文件中添加:
# Scrapy project settings (settings.py): project identity plus pipeline
# registration.
BOT_NAME = 'PHBS_News'
SPIDER_MODULES = ['PHBS_News.spiders']
NEWSPIDER_MODULE = 'PHBS_News.spiders'
# Pipelines keyed by dotted class path; the value (0-1000) is the order in
# which enabled pipelines run.
ITEM_PIPELINES = {
    #'importantNews.pipelines.ImportantnewsPipeline':300,
    'PHBS_News.pipelines.PhbsNewsPipeline':300
}