使用scrapy爬虫框架爬取北京大学汇丰商学院的新闻,新闻网址:http://www.phbs.pku.edu.cn/list-419-1.html
1.定义items.py容器类文件,代码如下:
import scrapy
class PhbsNewsItem(scrapy.Item):
    """Container for one scraped PHBS news article."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()    # article headline (utf-8 encoded by the spider)
    date = scrapy.Field()     # publish time string, "YYYY-mm-dd HH:MM:SS"
    origin = scrapy.Field()   # source/author shown on the page
    clicks = scrapy.Field()   # total hit count fetched from the JS counter endpoint
    content = scrapy.Field()  # article body (BeautifulSoup tag of the content div)
其中,spider中获取点击数的几行代码,通过请求页面内嵌js计数器的链接,模拟js的点击统计,获取新闻的点击数;
该链接返回如下内容:“$('#todaydowns').html('4');$('#weekdowns').html('6');$('#monthdowns').html('243');$('#hits').html('271');”
使用split('\'')[-2]按单引号拆分,并取倒数第二个字段,即总点击数(此例中为271)
import scrapy
import re
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from PHBS_News.items import PhbsNewsItem
from bs4 import BeautifulSoup
import urllib2
import sys
# Python 2 hack: reload(sys) re-exposes setdefaultencoding (removed from the
# module namespace by site.py) so implicit str/unicode conversions of the
# scraped Chinese text default to utf-8 instead of raising UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf8')
class PhbsNewsSpider(CrawlSpider):
    """Crawl news from the PHBS (Peking University HSBC Business School) site.

    Follows the paginated list pages and hands each article page to
    parse_torrent, which extracts title/date/origin/clicks/content into a
    PhbsNewsItem.
    """
    name = 'PhbsNews'
    allowed_domains = ['www.phbs.pku.edu.cn']
    start_urls = ['http://www.phbs.pku.edu.cn/list-419-1.html']
    # Rule 1 follows list pages (no callback, just link discovery);
    # Rule 2 routes article pages ("content-419...") to parse_torrent.
    rules = [Rule(SgmlLinkExtractor(allow=['http://www.phbs.pku.edu.cn/list-419-\d*.html'])),
             Rule(SgmlLinkExtractor(allow=['http://www.phbs.pku.edu.cn/content-419']), 'parse_torrent')]

    def parse_torrent(self, response):
        """Parse one article page into a PhbsNewsItem.

        :param response: the scrapy Response for an article page
        :return: a populated PhbsNewsItem
        """
        soup = BeautifulSoup(response.body)
        title = soup.find(attrs={'class': 'title'}).string
        date = soup.find(attrs={'class': 'inputtime'}).string
        origin = soup.find(attrs={'class': 'username'}).string
        # The click counter is filled in client-side by a <script src=...>
        # tag; request that JS endpoint directly to read the counts.
        clicks_url = soup.find(attrs={'language': 'JavaScript'})['src']
        # BUGFIX: the original rebound `response` (shadowing the scrapy
        # response parameter) and never closed the urllib2 connection.
        js_response = urllib2.urlopen(clicks_url)
        try:
            # Payload looks like:
            #   $('#todaydowns').html('4');...;$('#hits').html('271');
            # Splitting on single quotes, the second-to-last token is the
            # total hits value.
            clicks = js_response.read().split('\'')[-2]
        finally:
            js_response.close()
        content = soup.find(attrs={'class': 'content'})
        # The database was created with utf-8 encoding; encode to match.
        items = PhbsNewsItem()
        items['title'] = title.encode('utf-8')
        items['date'] = date
        items['origin'] = origin
        items['clicks'] = clicks
        items['content'] = content
        return items
import MySQLdb
import MySQLdb.cursors
from scrapy import log
import time
import sys
# Python 2 hack (same as in the spider module): restore setdefaultencoding
# and force utf-8 so item text round-trips to the utf-8 MySQL database
# without UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf8')
class PhbsNewsPipeline(object):
    """Persist scraped PHBS news items into the `opensns` MySQL database.

    Only articles newer than the latest category-3 row already stored are
    inserted (news row + news_detail row in one transaction).
    """

    # __init__ runs once per crawl: open one connection and remember the
    # newest stored article timestamp so re-crawls skip known articles.
    def __init__(self):
        self.db = MySQLdb.connect("localhost", "root", "", "opensns", charset='utf8')
        self.cursor = self.db.cursor()
        sql_check_date = ("select create_time from news "
                          "where category=3 order by create_time desc limit 1")
        self.cursor.execute(sql_check_date)
        result = self.cursor.fetchall()
        # Empty table -> 0 so every scraped article counts as "new".
        self.latest_date = result[0][0] if result else 0

    # Called by scrapy for every yielded item.
    def process_item(self, item, spider):
        """Insert `item` into news/news_detail if newer than latest_date.

        :return: the item, unchanged, so later pipelines still see it
        """
        # Convert "YYYY-mm-dd HH:MM:SS" to an epoch-seconds timestamp for
        # comparison/storage.
        time_array = time.strptime(item['date'], "%Y-%m-%d %H:%M:%S")
        time_stamp = int(time.mktime(time_array))
        if time_stamp > self.latest_date:
            # BUGFIX: use driver-parameterized queries instead of "%"
            # string interpolation -- titles/content come from scraped web
            # pages, so quotes in them would break (or inject into) the SQL.
            sql = ("INSERT INTO news(uid, title, category, status, view, dead_line, create_time) "
                   "VALUES (%s, %s, %s, %s, %s, %s, %s)")
            try:
                self.cursor.execute(
                    sql, ('1', item['title'], '3', '1', item['clicks'], '2147483640', time_stamp))
                # Detail row references the auto-increment id of the news row.
                sql_detail = "INSERT INTO news_detail(news_id, content) VALUES (%s, %s)"
                self.cursor.execute(sql_detail, (int(self.cursor.lastrowid), item['content']))
                # Commit both inserts together.
                self.db.commit()
            except MySQLdb.Error as e:
                # BUGFIX: the original bare `except:` silently swallowed every
                # failure; roll back and log so broken inserts are visible.
                self.db.rollback()
                self.handle_error(e)
        return item

    def handle_error(self, e):
        """Log a database error via scrapy's log facility."""
        log.err(e)
4.最后,不要忘记在settings.py文件中添加:
# Scrapy project settings (settings.py): project identity plus pipeline
# registration.
BOT_NAME = 'PHBS_News'
SPIDER_MODULES = ['PHBS_News.spiders']
NEWSPIDER_MODULE = 'PHBS_News.spiders'
# Pipelines keyed by dotted class path; the value (0-1000) is the order in
# which enabled pipelines run.
ITEM_PIPELINES = {
    #'importantNews.pipelines.ImportantnewsPipeline':300,
    'PHBS_News.pipelines.PhbsNewsPipeline':300
}