1. The spider file
import scrapy
from ..items import NewsdataItem


class NewsSpider(scrapy.Spider):
    name = 'news'
    allowed_domains = ['www.lunwenchina.cn']
    start_urls = ['http://www.lunwenchina.cn/html/11/category-catid-11.html']

    def parse(self, response):
        print(response.text)
        print("#########################")
        # Every article on the list page is an <li> under this <ul>
        total = response.xpath('//ul[@class="global_tx_list4"]/li')
        for b in total:
            item = NewsdataItem()
            # extract_first() returns the string itself rather than a
            # one-element list, so no str() conversion is needed
            title = b.xpath('./a/text()').extract_first()
            lianjie = b.xpath('./a/@href').extract_first()
            date = b.xpath('./span/text()').extract_first()
            print("*********************")
            item['title'] = title
            item['lianjie'] = lianjie
            item['date'] = date
            print("+++++++++++++++++++++")
            yield item

        # First attempt at pagination: follow the "next" link. Abandoned in
        # favour of the URL-pattern loop below; note the XPath was also
        # missing the @ before class, so it never matched anything:
        # next_page_url = response.xpath('//a[class="next"]/@href').extract()
        # if next_page_url:
        #     yield scrapy.Request(url=response.urljoin(next_page_url), callback=self.parse)

        for i in range(2, 8):  # crawl pages 2 through 7
            url = "http://www.lunwenchina.cn/html/11/category-catid-11-page-" + str(i) + ".html"
            print(url)
            # parse() runs on every page, so these requests get yielded again
            # on pages 2-7; Scrapy's duplicate filter drops the repeats
            yield scrapy.Request(url, meta={'item': item}, callback=self.parse)
        print("????????///")
Multi-page crawling boils down to observing the URL pattern of the list pages, building each page's URL, and yielding a Request for it. Note that meta={'item': item} must be present here: without it the code raises no error, but only one page of data ends up in the database.
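For comparison, here is what the abandoned "next"-link approach from the commented-out block would look like once its XPath is fixed (@ added before class, extract_first() instead of extract()). Whether the list page really exposes an <a class="next"> element is an assumption I have not re-verified:

    next_page_url = response.xpath('//a[@class="next"]/@href').extract_first()
    if next_page_url:
        # urljoin turns a relative href into an absolute URL
        yield scrapy.Request(url=response.urljoin(next_page_url), callback=self.parse)

This variant needs no hard-coded page count, so it keeps working if the site adds more pages.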
When writing a spider, scatter markers such as print("++++++") through the code; they make it obvious where the code goes wrong and at which step execution stops.
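The same trick with the logger Scrapy attaches to every spider, which adds timestamps and log levels for free (standard Scrapy, not from the original post):

    import scrapy

    class NewsSpider(scrapy.Spider):
        name = 'news'

        def parse(self, response):
            # serves the same purpose as the print markers above
            self.logger.info("parse() reached, url=%s", response.url)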
2. The pipeline file
import pymongo

class NewsdataPipeline(object):
    def process_item(self, item, spider):
        conn = pymongo.MongoClient('localhost', 27017)
        db = conn.xxxxx    # xxxxx stands for the database name
        table = db.yyyy    # yyyy stands for the collection (table) name
        table.insert_one(dict(item))
        return item
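process_item runs once per item, so the version above opens a new MongoDB connection for every single news entry. A sketch of the more common pattern, opening the connection once per crawl (same xxxxx/yyyy placeholders as above):

    import pymongo

    class NewsdataPipeline(object):
        def open_spider(self, spider):
            # called once when the crawl starts
            self.conn = pymongo.MongoClient('localhost', 27017)
            self.table = self.conn.xxxxx.yyyy

        def close_spider(self, spider):
            # called once when the crawl ends
            self.conn.close()

        def process_item(self, item, spider):
            self.table.insert_one(dict(item))
            return item

open_spider and close_spider are standard pipeline hooks that Scrapy calls automatically.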
3. settings.py
The code that needs changing:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
}

# Only NewsdataPipeline exists in pipelines.py; the original also listed a
# 'Newsdata.pipelines.MongodbPipeline' entry, which would crash the crawl at
# startup because no such class is defined.
ITEM_PIPELINES = {
    'Newsdata.pipelines.NewsdataPipeline': 300,
}

ROBOTSTXT_OBEY = False
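Run the crawl with scrapy crawl news (the spider's name attribute). Afterwards, a quick way to confirm that all seven list pages, not just one, made it into MongoDB, reusing the same placeholder names as the pipeline:

    import pymongo

    conn = pymongo.MongoClient('localhost', 27017)
    table = conn.xxxxx.yyyy            # same database / collection as the pipeline
    print(table.count_documents({}))   # expect roughly seven pages' worth of items
    for doc in table.find().limit(3):  # spot-check a few stored entries
        print(doc['title'], doc['date'])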