Code implementation:
# kaoyan.py
# -*- coding: utf-8 -*-
from copy import deepcopy

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class KaoyanSpider(CrawlSpider):
    name = 'kaoyan'
    allowed_domains = ['kaoyan365.cn']
    start_urls = ['http://www.kaoyan365.cn/kaoyantiaoji/tiaojixinxi/158281.html']

    rules = (
        # Extract the URL of each province's adjustment-information page.
        # The dots are escaped so "." matches a literal period rather than
        # any character (the original pattern left them unescaped).
        Rule(
            LinkExtractor(allow=r'http://www\.kaoyan365\.cn/kaoyantiaoji/tiaojixinxi/\d+\.html'),
            callback='parse_list',
            follow=False,
        ),
    )

    def parse_list(self, response):
        # Extract each university's name and detail-page link
        td_list = response.xpath('//div[@class="zg_list_left01_cont"]//td')
        for td in td_list:
            item = {}
            item["university"] = td.xpath('.//text()').extract_first()
            item["href"] = td.xpath('./a/@href').extract_first()
            if item["href"]:
                # deepcopy gives every request its own item dict, so
                # concurrent callbacks cannot overwrite each other's data
                yield scrapy.Request(
                    item["href"],
                    callback=self.parse_university,
                    meta={"item": deepcopy(item)}
                )

    def parse_university(self, response):
        # Collect the full text of the detail page
        item = response.meta["item"]
        item["content"] = response.xpath("//div[@class='zg_list_left01_cont']//text()").extract()
        yield item
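The spider is normally launched with the scrapy CLI, but it can also be started programmatically, which is handy for debugging in an IDE. Below is a minimal sketch of such a runner; the file name run.py, its placement in the project root, and the import path of KaoyanSpider are assumptions about the project layout, not part of the original code.

# run.py (hypothetical helper, not part of the original project)
# Starts the spider programmatically. Assumes it is executed from the
# Scrapy project root so that get_project_settings() finds settings.py.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# The actual import path depends on your project package, e.g.
# from <project>.spiders.kaoyan import KaoyanSpider
from kaoyan import KaoyanSpider

process = CrawlerProcess(get_project_settings())
process.crawl(KaoyanSpider)
process.start()  # blocks until the crawl finishes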
# pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re


class TiaojiPipeline(object):
    def process_item(self, item, spider):
        # Write the university name and link as a header line
        with open("考研调剂信息.txt", "a", encoding="utf-8") as f:
            f.write("***" + item["university"] + ":" + item["href"] + "\n")
        # Clean invalid characters out of the page text
        self.clear_item(item["content"])
        return item

    def clear_item(self, content_list):
        """Clean invalid data and append each text fragment to the file."""
        # Open the file once for the whole list instead of once per line
        with open("考研调剂信息.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                # \u3000 is the full-width (ideographic) space; the original
                # pattern r"u3000" only matched the literal text "u3000"
                content = re.sub(r"\u3000", "", content)
                f.write(content.strip() + "\n")
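As the boilerplate comment at the top of pipelines.py notes, the pipeline only runs once it is registered in ITEM_PIPELINES. A minimal sketch of the settings.py entry follows; the module path tiaoji.pipelines is an assumption inferred from the TiaojiPipeline class name, so replace it with the real project package.

# settings.py (excerpt)
# Register the pipeline so Scrapy calls process_item for every yielded item.
# NOTE: "tiaoji.pipelines" is an assumed module path based on the pipeline's
# class name; substitute your actual project package.
ITEM_PIPELINES = {
    'tiaoji.pipelines.TiaojiPipeline': 300,  # 300 = priority; lower runs first
}

With the pipeline enabled, running scrapy crawl kaoyan from the project root executes the whole flow and appends the cleaned text to 考研调剂信息.txt.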