【Full source】https://github.com/beng0305/ThirtySixSpider
【Environment】Python 2.7, Scrapy 1.4, PhantomJS, pyodbc, SQL Server 2008
The storage layer is pyodbc + SQL Server; pyodbc's encoding quirks took a long time to sort out.
PhantomJS is used to fetch JS-rendered content. It is quite slow, but on Windows it was the only workable option.
The approaches discussed online are all over the map. I tried scrapy-splash, which is reportedly fast, but Splash runs in a Docker container, and installing Docker on Windows threw up one problem after another, so I eventually gave up on it.
The crawler is on the slow side (an initial crawl of roughly 300 news articles takes twenty-odd minutes), but it is reasonably stable.
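For orientation, here is a minimal settings.py wiring the pieces together. This is a sketch only: the module paths, the middleware class name, and the priority numbers are assumptions about the project layout, not copied from the repo.
# settings.py (sketch; paths, names and priorities are assumptions)
BOT_NAME = 'ThirtySixSpider'
SPIDER_MODULES = ['ThirtySixSpider.spiders']
ITEM_PIPELINES = {
    'ThirtySixSpider.pipelines.ThirtySixPipeline': 300,
}
DOWNLOADER_MIDDLEWARES = {
    # hypothetical PhantomJS rendering middleware; see the sketch near the end
    'ThirtySixSpider.middlewares.PhantomJSMiddleware': 543,
}
DOWNLOAD_DELAY = 0.5  # PhantomJS rendering is slow anyway; a small delay keeps the crawl stable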
【Source】
Main spider: ThirtySixSpider.py
# -*- coding: utf-8 -*-
# Author: BinBin
# Email: 289594665@qq.com
# Time : 2017/07/27
import sys
import scrapy
from scrapy import Request
from ..items import ArticleItem

# Python 2 hack: make utf-8 the process-wide default codec so Chinese text
# does not trip implicit ascii conversions.
reload(sys)
sys.setdefaultencoding("utf-8")
class ThirtySixSpider(scrapy.Spider):
    name = "ThirtySix"
    allowed_domains = ["36kr.com"]
    start_urls = ['http://36kr.com']

    def parse(self, response):
        print "url:" + response.url
        print "response:" + response.__str__()
        # 1. Absolute article links such as http://36kr.com/p/5055572.html
        newsFullLinks = response.xpath('//a[re:test(@href, ".+(/p/\d+\.html)$")]/@href').extract()
        for link in newsFullLinks:
            yield Request(link, callback=self.parse_item)
        # 2. Relative article links such as /p/5084179.html
        newsIncompleteLinks = response.xpath('//a[re:test(@href, "^(/p/\d+\.html)$")]/@href').extract()
        for link in newsIncompleteLinks:
            link = response.urljoin(link)
            yield Request(link, callback=self.parse_item)
        # 3. Relative listing links such as /tags/..., /user/..., /topics/...
        otherIncompleteLinks = response.xpath('//a[re:test(@href, "(^/tags/|^/user/|^/topics/).*")]/@href').extract()
        for link in otherIncompleteLinks:
            link = response.urljoin(link)
            yield Request(link, callback=self.parse_next)
        # 4. Absolute listing links such as http://36kr.com/tags/..., /user/..., /topics/...
        otherFullLinks = response.xpath('//a[re:test(@href, "(^.+/tags/|^.+/user/|^.+/topics/).*")]/@href').extract()
        for link in otherFullLinks:
            yield Request(link, callback=self.parse_next)
    # Crawl follow-up listing pages (tags / user / topics) for more links.
    def parse_next(self, response):
        # 1. Absolute article links such as http://36kr.com/p/5055572.html
        newsFullLinks = response.xpath('//a[re:test(@href, ".+(/p/\d+\.html)$")]/@href').extract()
        for link in newsFullLinks:
            yield Request(link, callback=self.parse_item)
        # 2. Relative article links such as /p/5084179.html
        newsIncompleteLinks = response.xpath('//a[re:test(@href, "^(/p/\d+\.html)$")]/@href').extract()
        for link in newsIncompleteLinks:
            link = response.urljoin(link)
            print link
            yield Request(link, callback=self.parse_item)
        # 3. Relative listing links such as /tags/..., /user/..., /topics/...
        otherIncompleteLinks = response.xpath('//a[re:test(@href, "(^/tags/|^/user/|^/topics/).*")]/@href').extract()
        for link in otherIncompleteLinks:
            link = response.urljoin(link)
            yield Request(link, callback=self.parse_next)
        # 4. Absolute listing links such as http://36kr.com/tags/..., /user/..., /topics/...
        otherFullLinks = response.xpath('//a[re:test(@href, "(^.+/tags/|^.+/user/|^.+/topics/).*")]/@href').extract()
        for link in otherFullLinks:
            yield Request(link, callback=self.parse_next)
    # Parse one news page into an ArticleItem.
    def parse_item(self, response):
        print "parse_item url:" + response.url
        item = ArticleItem()
        article_titles = response.xpath('//div[re:test(@id, "J_post_wrapper_.*")]/div[1]/h1/text()').extract()
        if len(article_titles) > 0:
            print "article_title:" + article_titles[0]
            item["article_title"] = article_titles[0]
        article_authors = response.xpath('//div[re:test(@id, "J_post_wrapper_.*")]/div[1]/div[1]/div[contains(@class, "author-panel")]/div[contains(@class, "author")]/a/span/text()').extract()
        if len(article_authors) > 0:
            print "article_author:" + article_authors[0]
            item["article_author"] = article_authors[0]
        article_summarys = response.xpath('//div[re:test(@id, "J_post_wrapper_.*")]/div[1]/div[1]/section[@class="summary"]/text()').extract()
        if len(article_summarys) > 0:
            print "article_summary:" + article_summarys[0]
            item["article_summary"] = article_summarys[0]
        article_icons = response.xpath('//div[re:test(@id, "J_post_wrapper_.*")]/div[1]/div[1]/section[@class="headimg"]/img/@src').extract()
        if len(article_icons) > 0:
            print "article_icon:" + article_icons[0]
            item["article_icon"] = article_icons[0]
        article_contents = response.xpath('//div[re:test(@id, "J_post_wrapper_.*")]/div[1]/div[1]/div[2]/section').extract()
        if len(article_contents) > 0:
            item["article_content"] = article_contents[0]
        item["article_url"] = response.url
        # "article_title" is only set when the XPath matched; skip pages
        # (e.g. non-article pages) where it did not.
        if "article_title" in item:
            yield item
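The spider is normally launched with scrapy crawl ThirtySix from the project root. For completeness, a short driver script using Scrapy's public API does the same thing (a sketch; it assumes the standard project scaffold so get_project_settings can find settings.py):
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Run the spider programmatically; equivalent to "scrapy crawl ThirtySix".
process = CrawlerProcess(get_project_settings())
process.crawl('ThirtySix')
process.start()  # blocks until the crawl finishes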
Database wrapper: DBHelper.py
# -*- coding: utf-8 -*-
# Author: BinBin
# Email: 289594665@qq.com
# Time : 2017/07/27
import pyodbc

class DBHelper(object):
    def __init__(self, serverIp, port, dbName, uid, pwd):
        conn_info = 'DRIVER={SQL Server};DATABASE=%s;SERVER=%s,%s;UID=%s;PWD=%s' % (dbName, serverIp, port, uid, pwd)
        self.connection = pyodbc.connect(conn_info, unicode_results=True)
        self.cursor = self.connection.cursor()

    def __del__(self):
        if self.cursor:
            self.cursor.close()
            self.cursor = None
            print('__del__ cursor closed')
        if self.connection:
            self.connection.close()
            self.connection = None

    def destroy(self):
        if self.cursor:
            print('destroy cursor closed')
            self.cursor.close()
            self.cursor = None
        if self.connection:
            self.connection.close()
            self.connection = None

    # Fetch every row of a query.
    def queryAll(self, qryStr):
        self.cursor.execute(qryStr)
        return self.cursor.fetchall()

    # Fetch at most maxCount rows of a query.
    def querySome(self, qryStr, maxCount):
        self.cursor.execute(qryStr)
        return self.cursor.fetchmany(maxCount)

    # Fetch one page of a query: skip skipCnt rows, then return pageSize rows.
    def queryPage(self, qryStr, skipCnt, pageSize):
        self.cursor.execute(qryStr)
        self.cursor.skip(skipCnt)
        return self.cursor.fetchmany(pageSize)

    # Return the scalar result of a count query.
    def count(self, sql):
        self.cursor.execute(sql)
        return self.cursor.fetchone()[0]

    # Execute an insert/update/delete; returns the number of affected rows.
    def execute(self, sql):
        count = self.cursor.execute(sql).rowcount
        self.connection.commit()
        return count
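A quick usage example for the wrapper (the server address and credentials are placeholders; T_Article is the table the pipeline writes to):
helper = DBHelper('127.0.0.1', '1433', 'TestForBinBin', 'sa', '******')
print helper.count('select count(*) from T_Article')  # total stored articles
for row in helper.querySome('select article_title, article_url from T_Article', 10):
    print row.article_title, row.article_url  # pyodbc rows expose columns by name
helper.destroy()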
Item definition: items.py
# -*- coding: utf-8 -*-
# Author: BinBin
# Email: 289594665@qq.com
# Time : 2017/07/27
import scrapy
from scrapy import Field

class ArticleItem(scrapy.Item):
    article_title = Field()
    article_author = Field()
    article_src = Field()
    article_url = Field()
    article_type = Field()
    article_content = Field()
    article_summary = Field()
    article_icon = Field()
    article_time = Field()
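The pipeline below persists six of these fields into a T_Article table. The table DDL is not part of the listings, so the following schema is only an inference from the INSERT statement (column names come from the code; the types are guesses):
# Possible T_Article schema, inferred from the INSERT in pipelines.py (types are guesses).
ddl = '''
create table T_Article (
    id              int identity(1, 1) primary key,
    article_title   nvarchar(200),
    article_author  nvarchar(100),
    article_url     nvarchar(500),
    article_content nvarchar(max),
    article_summary nvarchar(1000),
    article_icon    nvarchar(500)
)'''
helper = DBHelper('127.0.0.1', '1433', 'TestForBinBin', 'sa', '******')
helper.execute(ddl)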
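Downloader middleware: middlewares.py
JS-heavy pages are fetched through PhantomJS, which in Scrapy is normally wired in as a downloader middleware. Below is a minimal sketch of the usual selenium + PhantomJS pattern rather than the repo's exact code (the class name is hypothetical; it assumes selenium is installed and the phantomjs binary is on PATH):
# -*- coding: utf-8 -*-
# Sketch of a PhantomJS rendering middleware (hypothetical; not the repo's exact code).
from selenium import webdriver
from scrapy.http import HtmlResponse

class PhantomJSMiddleware(object):
    def __init__(self):
        # One shared browser instance; assumes phantomjs is on PATH.
        self.driver = webdriver.PhantomJS()
        self.driver.set_page_load_timeout(30)

    def process_request(self, request, spider):
        # Let PhantomJS execute the page's JS, then hand Scrapy the rendered
        # DOM, so the spider's XPaths see the final markup.
        self.driver.get(request.url)
        body = self.driver.page_source.encode('utf-8')
        return HtmlResponse(request.url, body=body, encoding='utf-8', request=request)

    def __del__(self):
        self.driver.quit()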
Item pipeline: pipelines.py
# -*- coding: utf-8 -*-
# Author: BinBin
# Email: 289594665@qq.com
# Time : 2017/07/27
from DBHelper import DBHelper

class ThirtySixPipeline(object):
    def __init__(self):
        self.helper = DBHelper('120.*.215.*', '1433', 'TestForBinBin', 'sa', '******')

    def process_item(self, item, spider):
        print "process_item title:" + item["article_title"]
        # Build the INSERT statement; fields the spider never set default to ''.
        sql = u'insert into T_Article(article_title, article_author, article_url, article_content, article_summary, article_icon) values (\'{t}\',\'{a}\',\'{u}\',\'{c}\',\'{s}\',\'{i}\')'\
            .format(
                t=item.get("article_title", u''),
                a=item.get("article_author", u''),
                u=item.get("article_url", u''),
                c=item.get("article_content", u''),
                s=item.get("article_summary", u''),
                i=item.get("article_icon", u'')
            )
        # \xa0 (non-breaking space) has no GBK mapping, so swap it for a plain
        # space before encoding; note that replace() returns a new string.
        sql = sql.replace(u'\xa0', u' ')
        row = self.helper.execute(sql.encode('GBK', 'ignore'))
        return item
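Formatting values straight into the SQL string is what forces the GBK round-trip and the \xa0 cleanup, and a stray quote in a title would break the statement. As a comparison, a parameterized process_item (a sketch, dropped into the same class and reusing its DBHelper connection) lets pyodbc bind the unicode values itself:
    # Parameterized alternative (sketch): pyodbc binds unicode directly, so no
    # GBK encoding or \xa0 cleanup is needed, and quotes in titles are harmless.
    def process_item(self, item, spider):
        self.helper.cursor.execute(
            u'insert into T_Article(article_title, article_author, article_url, '
            u'article_content, article_summary, article_icon) '
            u'values (?, ?, ?, ?, ?, ?)',
            item.get("article_title"), item.get("article_author"),
            item.get("article_url"), item.get("article_content"),
            item.get("article_summary"), item.get("article_icon"))
        self.helper.connection.commit()
        return item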