The spider file is as follows:
import scrapy
import re
from Gaoxiao.items import GaoxiaoItem
class GaoxiaoSpider(scrapy.Spider):
    """Crawl college data from the in985.com JSON API.

    For every 4-digit college code the crawl chains through several
    endpoints, carrying one accumulating record through request ``meta``:

        introduce -> studyValue -> rule -> specialties -> history IDs
        -> enrollDiff (one request per history id)
        -> enrollDetail (one request per history id) -> yield item

    NOTE(review): indentation was reconstructed from a flat paste; the
    nesting below is the most plausible reading -- verify against the
    original file.
    """
    name = 'gaoxiao'
    allowed_domains = ['in985.com']
    # College codes run 1..9999, zero-padded to 4 digits; one sub-list per
    # padding width (start_requests flattens them).
    start_urls = [['https://www.in985.com/api/v1/college/introduce/000{}'.format(i) for i in range(1,10)],
                  ['https://www.in985.com/api/v1/college/introduce/00{}'.format(i) for i in range(10, 100)],
                  ['https://www.in985.com/api/v1/college/introduce/0{}'.format(i)for i in range(100, 1000)],
                  ['https://www.in985.com/api/v1/college/introduce/{}'.format(i) for i in range(1000, 10000)]]

    def start_requests(self):
        # start_urls is a list of lists, so flatten it manually instead of
        # relying on scrapy's default start_requests.
        for url_list in self.start_urls:
            for url in url_list:
                yield scrapy.Request(url,dont_filter=True,callback=self.parse)

    def parse(self, response):
        """Scrape basic college info out of the /introduce JSON by regex."""
        item = GaoxiaoItem()
        # A "code" of 40 appears to mean "no such college" -- presumably an
        # API error code; such responses are dropped entirely (TODO confirm).
        if re.findall('"code":(.*?),',response.text)[0]!=str(40):
            item['college_name']=re.findall('collegeName":"(.*?)","phone"', response.text, re.S)[0]
            # Each optional field falls back to '' when the regex finds nothing.
            if re.findall('"url":"(.*?)",', response.text, re.S)==[]:
                item['college_url']=''
            else:
                item['college_url']=re.findall('"url":"(.*?)",', response.text, re.S)[0]
            if re.findall('"phone":"(.*?)",', response.text, re.S)==[]:
                item['college_phone'] =''
            else:
                # Multiple phone numbers are ';'-separated; use spaces instead.
                item['college_phone'] = re.findall('"phone":"(.*?)",', response.text, re.S)[0].replace(';',' ')
            if re.findall('"introduce":"(.*?)",', response.text, re.S)==[]:
                item['college_introduce'] =''
            else:
                # The introduce field embeds HTML: grab the text between tags.
                introduce = re.findall('>(.*?)<', response.text, re.S)
                # Join the fragments into a single string and strip leftover
                # markup characters.
                item['college_introduce'] = ''.join(introduce).replace(' ','').replace('"','"')\
                    .replace(' ',' ').replace('&',' ').replace(';',' ')
            # Build the "study value" URL for the same 4-digit college code.
            college_code=response.url[-4:]
            StudyValue_Url='https://www.in985.com/api/v1/college/studyValue/'+college_code
            # NOTE(review): meta=item relies on Request copying the item into a
            # plain dict; downstream callbacks treat response.meta as the item.
            yield scrapy.Request(StudyValue_Url,meta=item,callback=self.parse_studyValue)

    def parse_studyValue(self, response):
        """Store the raw studyValue body and chain to the admission rules."""
        # The record carried over from the previous callback (a dict copy of
        # the item plus scrapy's own meta keys).
        item=response.meta
        # Keep the whole JSON response text as-is.
        item['studyValue']=response.text
        # Build the admission-regulations URL.
        college_code = response.url[-4:]
        Admission_Regulations_Url = 'https://www.in985.com/api/v1/college/rule/' + college_code
        yield scrapy.Request(Admission_Regulations_Url, meta=item, callback=self.parse_Admission_Regulations)

    def parse_Admission_Regulations(self,response):
        """Extract the admission-regulations text and chain to major strengths."""
        # The record carried over from the previous callback.
        item = response.meta
        # The regulations are embedded HTML: keep only text between tags.
        Regulation = re.findall('>(.*?)<', response.text, re.S)
        item['Admission_Regulations'] = ''.join(Regulation).replace(' ','').replace("; ;",",").replace(';','')
        # Build the "major power" (specialties) URL; request a single big page.
        college_code = response.url[-4:]
        MajorPower_Url = 'https://www.in985.com/api/v1/college/specialties/'+ college_code+ '?Keyword=&PageIndex=1&PerPageSize=1000&Sort='
        yield scrapy.Request(MajorPower_Url, meta=item, callback=self.parse_MajorPower)

    def parse_MajorPower(self, response):
        """Record the specialties list and chain to the history-id lookup."""
        # Slice the 4-digit code back out of the URL, skipping the fixed
        # query string. NOTE(review): the offsets [-46:-42] assume a specific
        # query-string length -- verify they actually isolate the code.
        college_code = response.url[-46:-42]
        item=response.meta
        if re.findall('"totalItems":(.*?)}', response.text)[0] == str(0):
            # No specialties listed for this college.
            item['MajorPower']=''
            collegeHistoryId_url = 'https://www.in985.com/api/v1/history/college/' + college_code
            yield scrapy.Request(collegeHistoryId_url, meta=item, callback=self.parse_HistoryId)
        else:
            # Keep the raw JSON "items" array as a string.
            MajorPower=re.findall('"items":\\[(.*?)\\],"pageIndex"', response.text, re.S)[0]
            item['MajorPower']=MajorPower
            collegeHistoryId_url = 'https://www.in985.com/api/v1/history/college/' + college_code
            yield scrapy.Request(collegeHistoryId_url, meta=item, callback=self.parse_HistoryId)

    def parse_HistoryId(self,response):
        """Collect history ids and start the enrollDiff fan-out, or finish."""
        item=response.meta
        if re.findall('"collegeHistoryId":(.*?),', response.text, re.S):
            item['collegeHistoryId'] = re.findall('"data":\\[(.*?)\\]}', response.text, re.S)[0]
            # Put every collegeHistoryId into a list.
            ID_list=re.findall('"collegeHistoryId":(.*?),', response.text, re.S)
            item['college_AdmissionScore'] =[]
            college_AdmissionScore_url = 'https://www.in985.com/api/v1/history/college/enrollDiff/' + ID_list[0]
            yield scrapy.Request(college_AdmissionScore_url, meta={'item':item,'ID_list':ID_list}, callback=self.parse_college_AdmissionScore)
        else:
            # No history data: mark the remaining fields empty and finish here;
            # the later callbacks never run for this college.
            item['collegeHistoryId']=None
            item['college_AdmissionScore']=None
            item['Admission_detail']=None
            #item['major_AdmissionScore']=None
            yield item

    def parse_college_AdmissionScore(self,response):
        """Accumulate enrollDiff data per history id, then start enrollDetail."""
        item = response.meta['item']
        ID_list=response.meta['ID_list']
        item['college_AdmissionScore'].append(re.findall('"data":\\[(.*?)\\]}', response.text, re.S)[0])
        # NOTE(review): every callback re-yields requests for ALL of
        # ID_list[1:]; the default dupefilter drops the repeats, but responses
        # can arrive in any order, so the "last url" check below may fire
        # before every append has happened.
        for url in ['https://www.in985.com/api/v1/history/college/enrollDiff/{}'.format(ID) for ID in ID_list[1:]]:
            yield scrapy.Request(url, meta={'item':item,'ID_list':ID_list}, callback=self.parse_college_AdmissionScore)
        if response.url=='https://www.in985.com/api/v1/history/college/enrollDiff/'+ID_list[-1]:
            item['Admission_detail']=[]
            Admission_detail_url= 'https://www.in985.com/api/v1/history/college/enrollDetail/' + ID_list[0]
            yield scrapy.Request(Admission_detail_url, meta={'item':item,'ID_list':ID_list}, callback=self.parse_Admission_detail)

    def parse_Admission_detail(self,response):
        """Accumulate enrollDetail data per history id; yield the final item."""
        item = response.meta['item']
        ID_list = response.meta['ID_list']
        item['Admission_detail'].append(re.findall('"data":\\[(.*?)\\]}', response.text, re.S)[0])
        # Same dupefilter-dependent fan-out pattern as the enrollDiff step.
        for url in ['https://www.in985.com/api/v1/history/college/enrollDetail/{}'.format(ID) for ID in ID_list[1:]]:
            yield scrapy.Request(url, meta={'item': item, 'ID_list': ID_list},
                                 callback=self.parse_Admission_detail)
        # The response for the last id ends the chain and emits the item.
        if response.url == 'https://www.in985.com/api/v1/history/college/enrollDetail/' + ID_list[-1]:
            yield item
The items file is as follows:
import scrapy
class GaoxiaoItem(scrapy.Item):
    """Container for one college's scraped data.

    Field names mirror the dictionary keys used by the gaoxiao spider and
    the CSV pipeline.
    """
    college_name = scrapy.Field()
    college_introduce = scrapy.Field()
    college_phone = scrapy.Field()
    college_url = scrapy.Field()
    # Fix: the spider sets and the pipeline reads the key 'studyValue'
    # (lower-case s), so that field must be declared for the item to accept
    # it. The original 'StudyValue' is kept for backward compatibility.
    studyValue = scrapy.Field()
    StudyValue = scrapy.Field()
    Admission_Regulations = scrapy.Field()
    MajorPower = scrapy.Field()
    collegeHistoryId = scrapy.Field()
    college_AdmissionScore = scrapy.Field()
    Admission_detail = scrapy.Field()
    # Used only by the 'li_ke' spider/pipeline branch.
    major_AdmissionScore = scrapy.Field()
The settings file is as follows:
BOT_NAME = 'Gaoxiao'

SPIDER_MODULES = ['Gaoxiao.spiders']
NEWSPIDER_MODULE = 'Gaoxiao.spiders'

from fake_useragent import UserAgent
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# NOTE(review): this picks ONE random UA at startup (not per request), and
# DEFAULT_REQUEST_HEADERS below also sets 'User-Agent' -- confirm which one
# your scrapy version actually sends.
USER_AGENT = UserAgent(verify_ssl=False).random

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 100

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# The Authorization token differs between the liberal-arts (wenke) and the
# science (like) account -- uncomment the block that matches the track.
# Liberal-arts headers:
DEFAULT_REQUEST_HEADERS = {
    'Authorization': 'eyJBbGciOiJSUzI1NiIsIlR5cCI6IkpXVCJ9.eyJJc3MiOm51bGwsIlN1YiI6IjE4MTQwMTUxNzAwOjg3NzI5OTQ2OjQ1MDI1IiwiQXVkIjoiMSIsIkV4cCI6MTU1MzYwNjA2OSwiSWF0IjoxLCJKdGkiOiIxMzUzMjk0MDgifQ.T6b9ZxpYxf7o7u1fRC5VTlquWyAIWYTmd-C9ucnBfHg',
    'User-Agent': UserAgent().random
}
# Science headers:
#DEFAULT_REQUEST_HEADERS = {
#    'Authorization': 'eyJBbGciOiJSUzI1NiIsIlR5cCI6IkpXVCJ9.eyJJc3MiOm51bGwsIlN1YiI6IjEzNDM4OTk2NzUwOjE4NzIxMjM0ODM6MTk4OCIsIkF1ZCI6IjIiLCJFeHAiOjE1NTQwNjU3NjUsIklhdCI6MSwiSnRpIjoiMTk4Mjg0OTIyMCJ9.PfMUuQnJ78xK_nHSbseMJizOw6k3SWSqvVBNDej1b9k',
#    'User-Agent': UserAgent().random
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Gaoxiao.middlewares.GaoxiaoSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'Gaoxiao.middlewares.GaoxiaoDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Gaoxiao.pipelines.GaoxiaoPipeline': 300,
}
The pipelines file is as follows:
import csv
import json
class GaoxiaoPipeline(object):
    """Route items from the three spiders to their respective output files.

    * ``get_plan``: appends raw plan strings to a text file.
    * ``gaoxiao``: writes one CSV row per college.
    * ``li_ke``: writes one CSV row per major admission score.
    """

    def open_spider(self, spider):
        # Open the output matching the spider name.  newline='' prevents the
        # csv module from emitting blank lines on Windows, and an explicit
        # utf-8 encoding keeps the Chinese text portable across locales.
        if spider.name == 'get_plan':
            self.file = open('CollegePlan理科-1.txt', 'a', encoding='utf-8')
        if spider.name == 'gaoxiao':
            self.file = open('文科--.csv', 'w', newline='', encoding='utf-8')
            self.writer = csv.writer(self.file)
            # Header row.
            self.writer.writerow(('college_name', 'college_url', 'college_phone',
                                  'college_introduce', 'studyValue',
                                  'Admission_Regulations', 'MajorPower',
                                  'collegeHistoryId', 'college_AdmissionScore',
                                  'Admission_detail'))
        if spider.name == 'li_ke':
            self.file = open('文科专业录取分数.csv', 'w', newline='', encoding='utf-8')
            self.writer = csv.writer(self.file)
            # Header row.
            self.writer.writerow(('college_name', 'collegeHistoryId', 'major_AdmissionScore'))

    def process_item(self, item, spider):
        # Write the record and flush immediately so a crash mid-crawl does
        # not lose buffered rows.
        if spider.name == 'get_plan':
            self.file.write(item['plans'] + '\n')
            self.file.flush()
        elif spider.name == 'gaoxiao':
            self.writer.writerow((item['college_name'], item['college_url'],
                                  item['college_phone'], item['college_introduce'],
                                  item['studyValue'], item['Admission_Regulations'],
                                  item['MajorPower'], item['collegeHistoryId'],
                                  item['college_AdmissionScore'], item['Admission_detail']))
            self.file.flush()
        elif spider.name == 'li_ke':
            self.writer.writerow((item['college_name'], item['collegeHistoryId'],
                                  item['major_AdmissionScore']))
            self.file.flush()
        # Fix: always return the item -- the original implicitly returned
        # None (dropping the item) for any other spider name.
        return item

    def close_spider(self, spider):
        # Fix: only close when a file was actually opened; the original
        # unconditional self.file.close() raised AttributeError for spiders
        # whose name matched none of the branches in open_spider.
        f = getattr(self, 'file', None)
        if f is not None:
            f.close()