1.文件结构:
2.lesson.py代码
import scrapy
from ts.items import TsItem
from scrapy.http import Request
class LessonSpider(scrapy.Spider):
    """Crawl Hellobi Edu course pages (ids 1-120) and scrape basic stats.

    Per course page the following fields are extracted (each is the raw
    list returned by ``extract()``):
      * title -- text of the active breadcrumb crumb (course title)
      * link  -- href of the active nav tab
      * stu   -- text of the ``course-view`` span (view/student count)
    """

    name = 'lesson'
    allowed_domains = ['hellobi.com']
    start_urls = ['https://edu.hellobi.com/course/1']

    def start_requests(self):
        # Yield every course URL exactly once up front.  The original
        # version re-yielded the whole 2..120 range from every parse()
        # call, creating ~119 duplicate requests per page and relying on
        # the dupefilter to discard them.
        for i in range(1, 121):  # 控制课程数 (controls how many courses are crawled)
            yield Request('https://edu.hellobi.com/course/' + str(i),
                          callback=self.parse)

    def parse(self, response):
        """Extract one TsItem from a course detail page."""
        item = TsItem()
        item['title'] = response.xpath(
            "//ol[@class='breadcrumb']/li[@class='active']/text()").extract()
        item['link'] = response.xpath(
            "//ul[@class='nav nav-tabs']/li[@class='active']/a/@href").extract()
        item['stu'] = response.xpath(
            "//span[@class='course-view']/text()").extract()
        yield item
3.items.py代码
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class TsItem(scrapy.Item):
    """Container for one scraped course record."""

    # Each field holds the raw list returned by ``response.xpath(...).extract()``.
    title = scrapy.Field()  # course title (breadcrumb text)
    link = scrapy.Field()   # href of the active nav tab
    stu = scrapy.Field()    # view/student count text
4.pipelines.py代码
class TsPipeline(object):
    """Print each scraped item and append it to a plain-text result file."""

    def __init__(self):
        # Explicit UTF-8 so the Chinese titles are written portably instead
        # of depending on the platform default encoding (GBK on Windows).
        self.fh = open("D:/软件(学习)/Python/Test/chapter8/result/ts.txt",
                       "a", encoding="utf-8")

    def process_item(self, item, spider):
        """Log the item to stdout and the result file; return it unchanged."""
        print(item['title'])
        print(item['link'])
        print(item['stu'])
        print('~~~~~~')
        # extract() returns an empty list when an XPath matches nothing;
        # fall back to '' instead of raising IndexError.
        title = item['title'][0] if item['title'] else ''
        link = item['link'][0] if item['link'] else ''
        stu = item['stu'][0] if item['stu'] else ''
        self.fh.write(title + "\n" + link + "\n" + stu + "\n" + "~~~~~~~" + "\n")
        return item

    def close_spider(self, spider):
        # Scrapy invokes this hook as close_spider(spider); the original
        # no-argument signature raised TypeError at shutdown, so the file
        # handle was never closed.
        self.fh.close()
5.settings.py代码
# Scrapy project settings (excerpt) for the "ts" project.
BOT_NAME = 'ts'
SPIDER_MODULES = ['ts.spiders']
NEWSPIDER_MODULE = 'ts.spiders'
...
# NOTE(review): with robots.txt honoured, the crawl silently skips any
# path the target site disallows — confirm edu.hellobi.com permits
# /course/* before relying on the output file.
ROBOTSTXT_OBEY = True
...
# Register the text-file pipeline; 300 is its order (lower runs first).
ITEM_PIPELINES = {
'ts.pipelines.TsPipeline': 300,
}
TXT文件展示: