import scrapy
import pandas as pd
from scrapy.http import Request
from third.items import Sch_scoreItem
class SchScoreSpider(scrapy.Spider):
    """Scrape per-area admission-score tables from gkcx.eol.cn.

    School ids are read from a local CSV (first column of each row); for
    every school a fixed set of area pages is requested, and any score
    table found on the page is appended to a local output CSV.
    """

    name = 'sch_score'
    allowed_domains = ['gkcx.eol.cn']

    # Area/page ids appended to each school's URL path.
    AREA_IDS = [10154, 10036, 10037, 10038, 10149]

    def start_requests(self):
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}
        schools = pd.read_csv("/home/kong/Desktop/result.csv")
        nrow = schools.shape[0]
        print(nrow)
        # NOTE(review): only the first 2 schools are crawled — this looks
        # like a debug limit; use range(nrow) to crawl every school.
        for i in range(0, 2):
            school_id = schools.values[i][0]
            for area_id in self.AREA_IDS:
                url = ("https://gkcx.eol.cn/schoolhtm/schoolAreaPoint/"
                       + str(school_id) + "/10021/10035/" + str(area_id) + ".htm")
                # The URL is echoed through meta so parse() can re-fetch the
                # page's HTML tables with pandas.read_html.
                yield Request(url, headers=headers, meta={'mt1': url})

    def parse(self, response):
        """Extract the score table and append it to the output CSV.

        Writes one row with the school label followed by the full score
        table; pages that report "暂时没有数据" (no data yet) are skipped.
        """
        url = response.meta['mt1']
        item = Sch_scoreItem()
        item["school"] = response.xpath('//p[@class="li-school-label"]/span/text()').extract()
        data = pd.read_html(url)[0]
        if data.values[0][0] == "暂时没有数据":
            print("无数据")
        else:
            school = pd.DataFrame([item["school"]])
            school.to_csv('/home/kong/Desktop/sch_score2.csv', sep=',', mode='a', index=False)
            data.to_csv('/home/kong/Desktop/sch_score2.csv', sep=',', mode='a', index=False)
import scrapy
import pandas as pd
from scrapy.http import Request
from third.items import Sch_scoreItem
class SchScoreSpider(scrapy.Spider):
    """Scrape per-major admission-score detail tables from gkcx.eol.cn.

    School ids are read from a local CSV (first column of each row); for
    every school one specialty-score page per year (2008-2018) is
    requested, and any score table found is appended to a local CSV.
    """

    name = 'sch_score'
    allowed_domains = ['gkcx.eol.cn']

    # Admission years whose specialty-score pages are crawled.
    YEARS = [2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]

    def start_requests(self):
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}
        schools = pd.read_csv("C:/Users/Administrator/Desktop/result.csv")
        nrow = schools.shape[0]
        for i in range(0, nrow):
            school_id = schools.values[i][0]
            for year in self.YEARS:
                url = ("https://gkcx.eol.cn/schoolhtm/specialty/"
                       + str(school_id) + "/10035/specialtyScoreDetail_"
                       + str(year) + "_10021.htm")
                # The URL is echoed through meta so parse() can re-fetch the
                # page's HTML tables with pandas.read_html.
                yield Request(url, headers=headers, meta={'mt1': url})

    def parse(self, response):
        """Extract the specialty score table and append it to the output CSV.

        Writes the school label (no header row) followed by the full score
        table; pages that report "暂时没有数据" (no data yet) are skipped.
        """
        url = response.meta['mt1']
        item = Sch_scoreItem()
        item["school"] = response.xpath('//p[@class="li-school-label"]/span/text()').extract()
        data = pd.read_html(url)[0]
        if data.values[0][0] == "暂时没有数据":
            print("无数据")
        else:
            school = pd.DataFrame([item["school"]])
            school.to_csv('C:/Users/Administrator/Desktop/score_li.csv', sep=',', mode='a', index=False, header=False)
            data.to_csv('C:/Users/Administrator/Desktop/score_li.csv', sep=',', mode='a', index=False)