import scrapy
import re
from scrapy.http import Request
from urllib import parse
from ArticleSpider.items import ArticelItem
class KuqinSpider(scrapy.Spider):
    """Crawl article list pages on www.kuqin.com and scrape each article."""

    name = 'kuqin'
    allowed_domains = ['www.kuqin.com']
    start_urls = ['http://www.kuqin.com']

    def parse(self, response):
        """Extract article URLs from a list page and follow pagination.

        Yields one Request per article (handled by ``parse_detail``) and,
        when present, one Request for the next list page.
        """
        post_urls = response.css(".list-boxes h2 a::attr(href)").extract()
        for post_url in post_urls:
            # Hrefs may be relative; resolve against the current page URL.
            post_url = parse.urljoin(response.url, post_url)
            yield Request(url=post_url, callback=self.parse_detail)

        # extract_first() returns None instead of raising IndexError when the
        # pagination block is absent (e.g. on the last page), so the guard
        # below actually works — the original extract()[0] crashed first.
        next_url = response.xpath(
            "//div[@class='pagination']/ul/li[10]/a/@href").extract_first()
        if next_url:
            next_url = parse.urljoin(response.url, next_url)
            yield Request(url=next_url, callback=self.parse)

    def parse_detail(self, response):
        """Scrape title, creation date, author and content from an article page.

        Missing fields yield empty strings rather than raising IndexError.
        """
        article_item = ArticelItem()

        title = response.xpath(
            "//div[@class='tc-box first-box article-box']/h2/text()"
        ).extract_first("")
        create_date = response.xpath(
            "//div[@class='article-infobox']/span/text()").extract_first("")
        # Keep only the text before the last whitespace run; fall back to the
        # raw value when the pattern does not match (raw string avoids an
        # invalid-escape warning on \s).
        date_match = re.match(r"(.*)\s", create_date)
        if date_match:
            create_date = date_match.group(1).strip()
        # The author is expected in the second text node — TODO confirm the
        # page layout always provides it.
        authors = response.xpath(
            "//div[@class='kq__article-power']/p/text()").extract()
        author = authors[1] if len(authors) > 1 else ""
        content = response.xpath(
            "//div[@id='article_content']").extract_first("")

        article_item["title"] = title
        article_item["create_date"] = create_date
        article_item["author"] = author
        article_item["content"] = content
        yield article_item
class ArticelItem(scrapy.Item):
    """Container for one scraped article record."""

    # Fields populated by KuqinSpider.parse_detail.
    title = scrapy.Field()
    create_date = scrapy.Field()
    author = scrapy.Field()
    content = scrapy.Field()
import MySQLdb
import MySQLdb.cursors
import codecs
import json
from twisted.enterprise import adbapi
from scrapy.exporters import JsonItemExporter
class ArticlespiderPipeline(object):
    """Default no-op pipeline: every item passes through unchanged."""

    def process_item(self, item, spider):
        # Nothing to do here; hand the item to the next pipeline stage.
        return item
class JsonWithEncodingPipeline(object):
    """Append each item to ``acticle.json``, one JSON object per line.

    NOTE(review): the file name looks like a typo for "article.json", but it
    is kept as-is because external tooling may already read it.
    """

    def __init__(self):
        # Built-in open() with an explicit encoding replaces the legacy
        # codecs.open() call; behavior is identical.
        self.file = open("acticle.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # ensure_ascii=False keeps non-ASCII text (e.g. Chinese) readable.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # BUG FIX: Scrapy calls close_spider() automatically; the original
        # only defined spider_closed(), which is never invoked unless it is
        # manually connected to the spider_closed signal — so the file
        # handle was leaked.
        self.file.close()

    # Backward-compatible alias for code connected to the old method name.
    spider_closed = close_spider
class JsonExporterPipleline(object):
    """Export all items to ``articlexport.json`` via Scrapy's JsonItemExporter."""

    def __init__(self):
        # The exporter emits a single JSON array, so the target file must be
        # opened in binary mode.
        self.file = open("articlexport.json", "wb")
        self.exporter = JsonItemExporter(
            self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # Finish the JSON array before releasing the file handle.
        self.exporter.finish_exporting()
        self.file.close()
class MysqlPipeline(object):
    """Synchronously insert each item into the MySQL ``article`` table.

    Blocking inserts stall the crawl; prefer MysqlTwistedPipline for
    production use.
    """

    def __init__(self):
        # Connection parameters are hard-coded here; the async pipeline
        # reads them from settings instead.
        self.conn = MySQLdb.connect(
            '127.0.0.1', 'root', 'root', 'article_spider',
            charset='utf8', use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into article(title,create_date,author,content)
            values(%s,%s,%s,%s)
        """
        # Parameterized query: values are escaped by the driver.
        self.cursor.execute(insert_sql, (
            item["title"], item["create_date"],
            item["author"], item["content"]))
        self.conn.commit()
        # BUG FIX: a pipeline must return the item so later pipeline stages
        # still receive it; the original implicitly returned None.
        return item
class MysqlTwistedPipline(object):
    """Insert items into MySQL asynchronously via a Twisted adbapi pool.

    Inserts run on the pool's worker threads, so the reactor is not blocked
    while the database round-trips.
    """

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the connection pool from the project settings module."""
        db_params = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            password=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **db_params)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Schedule the insert on the pool; failures are logged via the
        # errback instead of propagating into the crawl.
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        # BUG FIX: return the item so later pipeline stages still receive
        # it; the original implicitly returned None.
        return item

    def handle_error(self, failure, item, spider):
        # Best-effort reporting of asynchronous insert failures.
        print(failure)

    def do_insert(self, cursor, item):
        """Run in a pool thread; runInteraction commits on success."""
        insert_sql = """
            insert into article(title,create_date,author,content)
            values(%s,%s,%s,%s)
        """
        cursor.execute(insert_sql, (
            item["title"], item["create_date"],
            item["author"], item["content"]))
# --- Scrapy project settings ---

# Do not fetch or honor robots.txt for this crawl.
ROBOTSTXT_OBEY = False

# Enabled item pipelines, keyed by dotted path; lower value = runs earlier.
ITEM_PIPELINES = {
    'ArticleSpider.pipelines.JsonExporterPipleline': 200,
}

# MySQL connection settings consumed by MysqlTwistedPipline.from_settings().
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'article_spider'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'
# Project link (项目链接)