spider
import logging

import scrapy

logger = logging.getLogger(__name__)


class CqwenzhengSpider(scrapy.Spider):
    name = 'cqWenZheng'
    allowed_domains = ['cqnews.net']
    again_url = "http://cqwz.cqnews.net/ask/searchResult?status=1&lastId="
    again_detail_url = "http://cqwz.cqnews.net/ask/askDetail?id="

    def start_requests(self):
        urls = [
            'http://cqwz.cqnews.net/ask/searchResult?status=1',
        ]
        for url in urls:
            # The listing endpoint expects POST; dont_filter=False keeps
            # Scrapy's default duplicate-request filter active.
            yield scrapy.Request(url=url, callback=self.parse, method="POST", dont_filter=False)

    def parse(self, response):
        assert isinstance(response, scrapy.http.TextResponse)
        json_data = response.json()
        if json_data.get("data") is None:
            logger.warning("Crawl finished: no more data")
            return
        data_list = json_data["data"]["dataList"]
        if not data_list:
            logger.warning("Crawl finished: empty page")
            return
        # Paginate: the id of the last entry becomes lastId for the next page.
        last_id = data_list[-1]["id"]
        url = self.again_url + str(last_id)
        yield scrapy.Request(url=url, callback=self.parse, method="POST", dont_filter=False)
        for entry in data_list:
            entry["detail_url"] = self.again_detail_url + str(entry["id"])
            logger.warning(entry)
            yield entry
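A note on the pagination above: parse takes the id of the last entry in dataList and appends it as lastId on the next POST, so the crawl walks the listing page by page until the response no longer carries data. A minimal sketch of the response shape the code assumes; only the data / dataList / id field names come from the code, the values are illustrative:

# Assumed JSON body returned by /ask/searchResult (values are made up):
{
    "data": {
        "dataList": [
            {"id": 1001, "title": "..."},
            {"id": 1002, "title": "..."}
        ]
    }
}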
pipelines
import pymongo
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError


class CqWenzhengPipeline:
    def __init__(self):
        # Pass credentials to MongoClient directly: Database.authenticate()
        # was removed in PyMongo 4. Host/port/credentials are placeholders.
        mongo_client = MongoClient(host="123", port=222,
                                   username="test", password="test",
                                   authSource="test")
        self.mongo_data_base = mongo_client["test"]
        # Start each run from a clean collection, then enforce uniqueness
        # on the source id so re-crawled items are rejected on insert.
        self.mongo_data_base.drop_collection("cqWenZheng")
        self.mongo_data_base["cqWenZheng"].create_index([("id", pymongo.ASCENDING)], unique=True)
        self.mongo_data_base["cqWenZheng"].create_index([("title", pymongo.ASCENDING)])

    def process_item(self, item, spider):
        try:
            self.mongo_data_base["cqWenZheng"].insert_one(dict(item))
        except DuplicateKeyError:
            spider.logger.warning("Duplicate key: {}".format(item["id"]))
        return item
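For Scrapy to route items through this pipeline, it has to be registered in the project's settings.py. A minimal sketch; the dotted path assumes a project package named cq_wenzheng, which is a guess:

# settings.py (the package name cq_wenzheng is hypothetical)
ITEM_PIPELINES = {
    "cq_wenzheng.pipelines.CqWenzhengPipeline": 300,
}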