想法:将全国各地高校的信息整合并保存下来,方便自己随时随地查询各高校官网信息!
资源网站:那些年,我们一起被折磨过的高考
资源提取方式:Scrapy爬虫
保存方式:mysql数据库
scrapy 项目 spider.py 代码:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from university.items import UniversityItem
class SchoolSpider(CrawlSpider):
name = 'school'
allowed_domains = ['u.feelingmsg.com']
start_urls = ['http://u.feelingmsg.com/u/guangdong.php']
rules = (
Rule(LinkExtractor(allow=r'http://u.feelingmsg.com/u/(.*?).php'), callback='parse_item', follow=True),
)
def parse_item(self, response):
item = UniversityItem()
results = response.xpath('/html/body/table[4]//tr')
for result in results:
schools = result.xpath('./td')
for school in schools:
name = school.xpath('./span/a/text()').get()
if name:
item['city'] = response.xpath