import scrapy
import re
from urllib import parse
from scrapy.http import Request
from Article_spider.Article_spider.items import duwenzhangSpiderItem, ArticalItemLoader
import datetime
from Article_spider.Article_spider.utils.common import get_md5
from scrapy.loader import ItemLoader


class DuwenzhangSpider(scrapy.Spider):
    name = 'duwenzhang'
    # allowed_domains = ['http://www.duwenzhang.com/']
    start_urls = ['http://www.duwenzhang.com/wenzhang/aiqingwenzhang/']

    def parse(self, response):
        # Parse the article URLs on each list page and hand them to Scrapy for download
        post_urls = response.xpath("//b/a/@href").extract()
        for post_url in post_urls:
            regex_str = r"(http://.*\d\.html)"
            match_obj = re.match(regex_str, post_url)
            if match_obj:
                list_url = match_obj.group(1)
                yield Request(url=list_url, callback=self.parse_detail)
        # Extract the next page and hand it to Scrapy for download
        # next_url = response.xpath("//a[15]/@href").extract_first()
        # if next_url:
        #     yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
        for i in range(2, 200):
            next_url = "http://www.duwenzhang.com/wenzhang/aiqingwenzhang/list_1_%s.html" % i
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
    def parse_detail(self, response):
        article_item = duwenzhangSpiderItem()
        # Article title
        title = response.xpath("//h1/text()").extract()[0]
        # Channel the article belongs to
        belong = response.xpath("//td[contains(@class,'pindao')]//a/text()").extract()
        belong = " ".join(belong)
        # Publication time
        create_time = response.xpath("//tr[2]/td/text()[2]").extract()[1].replace("\r\n", "").strip()
        regex_str = r".*时间:(\d{4}-\d{2}-\d{2} \d{2}:\d{2})"
        match_obj = re.match(regex_str, create_time)
        if match_obj:
            create_time = match_obj.group(1)
        # Article body
        article_plot = response.xpath("//div[@id='wenzhangziti']/p//text()").extract()
        article_plot = "".join(article_plot).replace("\u3000", "").replace("\xa0", "")
        article_item["title"] = title
        article_item["belong"] = belong
        try:
            create_time = datetime.datetime.strptime(create_time, "%Y-%m-%d %H:%M").date()
        except Exception:
            # Fall back to today's date if the timestamp cannot be parsed
            create_time = datetime.datetime.now().date()
        article_item["create_time"] = create_time
        article_item["article_plot"] = article_plot
        article_item["url"] = response.url
        article_item["url_obj_id"] = get_md5(response.url)
        yield article_item
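
The spider imports get_md5 from utils/common.py, which is not listed here. A minimal sketch of what that helper presumably looks like (an assumed implementation, not part of the original listing):

# utils/common.py -- assumed implementation; the original file is not shown
import hashlib


def get_md5(url):
    # Hash the URL into a fixed-length hex digest so it can serve as a
    # stable unique id (url_object_id) in the MySQL table
    if isinstance(url, str):
        url = url.encode("utf-8")
    return hashlib.md5(url).hexdigest()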
items.py
import re
import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst, Join
from scrapy.loader import ItemLoader
import datetime


class ArticleSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class ArticalItemLoader(ItemLoader):
    # Custom ItemLoader: take only the first extracted value by default
    default_output_processor = TakeFirst()


class duwenzhangSpiderItem(scrapy.Item):
    title = scrapy.Field()
    belong = scrapy.Field()
    create_time = scrapy.Field()
    article_plot = scrapy.Field()
    url = scrapy.Field()
    url_obj_id = scrapy.Field()
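
ArticalItemLoader is imported in the spider but never used there; parse_detail fills the item by hand. For reference, a hypothetical rewrite of parse_detail built on the custom loader, where TakeFirst replaces the manual [0] indexing:

# Hypothetical version of parse_detail using ArticalItemLoader; field names
# match duwenzhangSpiderItem, everything else is as in the spider above
def parse_detail(self, response):
    item_loader = ArticalItemLoader(item=duwenzhangSpiderItem(), response=response)
    item_loader.add_xpath("title", "//h1/text()")
    item_loader.add_xpath("belong", "//td[contains(@class,'pindao')]//a/text()")
    item_loader.add_xpath("article_plot", "//div[@id='wenzhangziti']/p//text()")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_obj_id", get_md5(response.url))
    article_item = item_loader.load_item()
    yield article_item

Note that with TakeFirst as the default output processor, belong and article_plot would keep only the first extracted text node; reproducing the manual " ".join / "".join behavior would require a Join() output processor on those fields.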
pipelines.py
import codecs
import json
from scrapy.exporters import JsonItemExporter
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi


class ArticleSpiderPipeline(object):
    def process_item(self, item, spider):
        return item


class JsonwithEncodingPipeline(object):
    # Custom JSON file export
    def __init__(self):
        self.file = codecs.open("article.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(lines)
        return item

    def close_spider(self, spider):
        # Pipelines are shut down via close_spider (not the spider_closed signal),
        # so this must be named close_spider for the file to actually be closed
        self.file.close()


class JsonExporterPipeline(object):
    # Export items to a JSON file with Scrapy's built-in JsonItemExporter
    def __init__(self):
        self.file = open('articleExport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


class MysqlPipeline(object):
    # Synchronous insertion: each item blocks until MySQL commits
    def __init__(self):
        self.conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'article_spider', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into duwenzhang(title,create_time,url,url_object_id,belong,article_plot)
            VALUES (%s,%s,%s,%s,%s,%s)
        """
        self.cursor.execute(insert_sql, (item["title"], item["create_time"], item["url"],
                                         item["url_obj_id"], item["belong"], item["article_plot"]))
        self.conn.commit()
        return item


class MysqlTwistedPipeline(object):
    # Asynchronous insertion via Twisted's adbapi connection pool
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Use Twisted to turn the MySQL insert into an asynchronous call
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)  # handle insert errors
        # process_item must return the item (or a Deferred), otherwise
        # downstream pipelines receive None
        return item

    def handle_error(self, failure):
        # Log asynchronous insert failures
        print(failure)

    def do_insert(self, cursor, item):
        # Perform the actual insert; runInteraction commits on success
        insert_sql = """
            insert into duwenzhang(title,create_time,url,url_object_id,belong,article_plot)
            VALUES (%s,%s,%s,%s,%s,%s)
        """
        cursor.execute(insert_sql, (
            item["title"], item["create_time"], item["url"], item["url_obj_id"], item["belong"], item["article_plot"]))