Python: scraping school admission lines and per-major admission lines

Two Scrapy spiders against gkcx.eol.cn: the first collects school-level admission lines (校线), the second per-major lines (专业线). Both read a list of school ids from result.csv, build one request per score page, and append the tables pandas parses out of each page to a CSV.

import scrapy
import pandas as pd

from scrapy.http import Request
from third.items import Sch_scoreItem


class SchScoreSpider(scrapy.Spider):
    name = 'sch_score'
    allowed_domains = ['gkcx.eol.cn']

    def start_requests(self):
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}

        urls = pd.read_csv("/home/kong/Desktop/result.csv")
        nrow = urls.shape[0]
        print(nrow)

        for i in range(0, 2):  # test run: only the first two schools
            # page codes identifying the score tables on gkcx.eol.cn
            codes = [10154, 10036, 10037, 10038, 10149]
            for j in codes:
                url = "https://gkcx.eol.cn/schoolhtm/schoolAreaPoint/" + str(urls.values[i][0]) + "/10021/10035/" + str(j) + ".htm"
                yield Request(url, headers=headers, meta={'mt1': url})

    def parse(self, response):
        url = response.meta['mt1']

        item = Sch_scoreItem()
        item["school"] = response.xpath('//p[@class="li-school-label"]/span/text()').extract()
        # let pandas parse the first HTML table on the page
        data = pd.read_html(url)[0]

        if data.values[0][0] == "暂时没有数据":  # the site's "no data yet" placeholder
            print("no data")
        else:
            # write the school name, then append its score table below it
            school = pd.DataFrame([item["school"]])
            school.to_csv('/home/kong/Desktop/sch_score2.csv', sep=',', mode='a', index=False)
            data.to_csv('/home/kong/Desktop/sch_score2.csv', sep=',', mode='a', index=False)
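
Both spiders import Sch_scoreItem from third.items, which the post never shows. A minimal sketch of what third/items.py could look like, assuming the item only carries the single school field the spiders actually fill:

import scrapy


class Sch_scoreItem(scrapy.Item):
    # school name extracted from the page header
    school = scrapy.Field()

The second version below applies the same pattern to the per-major score detail pages, this time walking every school in result.csv and requesting one page per year from 2008 to 2018: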

import scrapy
import pandas as pd

from scrapy.http import Request
from third.items import Sch_scoreItem


class SchScoreSpider(scrapy.Spider):
    name = 'sch_score'
    allowed_domains = ['gkcx.eol.cn']

    def start_requests(self):
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}

        urls = pd.read_csv("C:/Users/Administrator/Desktop/result.csv")
        nrow = urls.shape[0]
        #print(nrow)

        for i in range(0, nrow):  # full run over every school
            years = [2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
            for j in years:
                url = "https://gkcx.eol.cn/schoolhtm/specialty/" + str(urls.values[i][0]) + "/10035/specialtyScoreDetail_" + str(j) + "_10021.htm"
                yield Request(url, headers=headers, meta={'mt1': url})

    def parse(self, response):
        url = response.meta['mt1']

        item = Sch_scoreItem()
        item["school"] = response.xpath('//p[@class="li-school-label"]/span/text()').extract()
        data = pd.read_html(url)[0]

        if data.values[0][0] == "暂时没有数据":  # the site's "no data yet" placeholder
            print("no data")
        else:
            school = pd.DataFrame([item["school"]])
            school.to_csv('C:/Users/Administrator/Desktop/score_li.csv', sep=',', mode='a', index=False, header=False)
            data.to_csv('C:/Users/Administrator/Desktop/score_li.csv', sep=',', mode='a', index=False)
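
Assuming a standard Scrapy project layout named third, either version is started from the project root with the usual crawl command (both share the spider name sch_score, so keep only one of them in the project at a time):

scrapy crawl sch_score

One caveat: pd.read_html(url) downloads each page a second time instead of parsing the response.text Scrapy has already fetched, so every score table costs two requests; passing response.text to read_html would halve the traffic.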
