This post crawls the post listings of the Xi'an forum; if you also want the post bodies, comment counts, and other engagement data, you can crawl one level deeper.
First generate the spider xian with the command: scrapy genspider xian ixian.cn
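A note on setup: the imports further down reference a project named spider1, so presumably the project skeleton was created beforehand, along these lines:

scrapy startproject spider1
cd spider1

Running the genspider command from inside the project then drops xian.py into spider1/spiders.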
Contents of xian.py:
import scrapy
from scrapy.selector import Selector
from scrapy.http import Request
from spider1.items import XianItem


class XianSpider(scrapy.Spider):
    name = 'xian'
    allowed_domains = ['ixian.cn']
    # Defined but never attached to a request; see the settings note below
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
    start_urls = ['https://www.ixian.cn/forum-13-1.html']
    url_set = set()

    def parse(self, response):
        # Grab the title and url of every post in the City Development board of the Rongyao Xi'an forum
        hxs1 = Selector(response).xpath("//div[@class='busforumlist_item_title']/a")
        for item in hxs1:
            title = item.xpath(".//text()").extract_first()
            url = item.xpath(".//@href").extract_first()
            obj = XianItem(title=title, url=url)  # fields must be passed as keyword arguments
            yield obj

        # Collect all pagination urls
        hxs2 = Selector(response).xpath("//div[@class='pg']/a[re:test(@href, 'https://www.ixian.cn/forum-13-\\d+\\.html')]/@href").extract()
        for url in hxs2:
            if url in self.url_set:
                pass  # already collected, skip the duplicate
            else:
                self.url_set.add(url)
                print("new url: %s" % url)
                # Feed the new url back to the scheduler so it is fetched and parsed recursively
                yield Request(url=url, callback=self.parse)
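Before running the full crawl, the XPath expressions can be sanity-checked interactively with scrapy shell; a quick session (the url and selectors mirror the ones in parse() above):

scrapy shell 'https://www.ixian.cn/forum-13-1.html'
# then, at the shell prompt:
response.xpath("//div[@class='busforumlist_item_title']/a/text()").extract_first()  # first post title
response.xpath("//div[@class='pg']/a/@href").extract()  # pagination urls

Also worth noting: Scrapy's scheduler deduplicates requests by default, so url_set mainly serves the progress print rather than correctness.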
In items.py, define the item class that holds the scraped posts:
import scrapy


class XianItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
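Item instances behave like dicts, which is why the pipeline below reads item['title']; a small illustration (the values are made up):

obj = XianItem(title='some title', url='https://www.ixian.cn/thread-xxx.html')
print(obj['title'], obj['url'])  # dict-style access
# assigning to an undeclared field, e.g. obj['author'] = 'x', raises KeyError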
Create the pipeline in pipelines.py to persist the scraped records to a txt file:
class XianPipeline(object):
    def process_item(self, item, spider):
        temp_news = "%s\n%s\n\n" % (item['title'], item['url'])
        with open('荣耀西安网帖子.txt', 'a+', encoding='utf-8') as f:  # 'a+' appends, keeping earlier records
            f.write(temp_news)
        return item  # return the item so any later pipelines can still process it
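Opening and closing the file once per item works, but a common refinement (shown here as a sketch, not what this post uses) is to open the file once via the pipeline's open_spider/close_spider hooks:

class XianFilePipeline(object):  # hypothetical variant of XianPipeline
    def open_spider(self, spider):
        # called once when the spider starts
        self.f = open('荣耀西安网帖子.txt', 'a+', encoding='utf-8')

    def process_item(self, item, spider):
        self.f.write("%s\n%s\n\n" % (item['title'], item['url']))
        return item

    def close_spider(self, spider):
        self.f.close()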
In settings.py, set the crawl depth with DEPTH_LIMIT and register the pipeline:
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'spider1.pipelines.XianPipeline': 300,  # 300 is the pipeline's order value; lower values run earlier
}
DEPTH_LIMIT = 1  # recursion depth; 0 means no limit, i.e. unbounded recursion
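Since the header dict defined in the spider is never attached to a request, the simplest way to actually send that User-Agent is also through settings.py; both of these are standard Scrapy settings, though whether this site needs them is an assumption:

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
ROBOTSTXT_OBEY = False  # only disable this if robots.txt would otherwise block the crawl; check first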
Run from the command line (--nolog suppresses Scrapy's log output):
scrapy crawl xian --nolog
The crawl collects over 3,000 posts.