在文件夹里创建一个爬虫项目
scrapy startproject ITcast
在spiders目录下:
scrapy genspider itcast "itcast.cn"
---------------------------------------------------------------------------------------------------------------------------------------------------------------
#items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class ItcastItem(scrapy.Item):
    """Item with three fields describing a teacher: name, title and info."""

    # Teacher's name
    name = scrapy.Field()

    # Teacher's job title
    title = scrapy.Field()

    # Teacher's descriptive information
    info = scrapy.Field()
#spiders/itcast.py
# -*- coding: utf-8 -*-
import scrapy
class ItcastSpider(scrapy.Spider):
    """Spider that prints the name, title and info of each teacher entry.

    It requests the page listed in ``start_urls`` and, for every
    ``div.li_txt`` node found under ``div.tea_con``, prints the text of its
    ``h3``, ``h4`` and ``p`` children.
    """

    # Required: the spider's name.
    name = 'itcast'
    # Optional: domains the spider is allowed to crawl. Entries must be bare
    # domain names; Scrapy ignores (with a warning) entries written as URLs.
    allowed_domains = ['itcast.cn']
    # Start URL list: the first batch of requests is taken from this list.
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml']

    # The class attributes above are the spider's initial configuration.
    def parse(self, response):
        """Print name, title and info for every teacher node in *response*.

        Args:
            response: the response object for a requested page; only its
                ``xpath`` method is used here.
        """
        node_list = response.xpath(
            "//div[@class='tea_con']//div[@class='li_txt']"
        )
        for node in node_list:
            # extract_first() yields the first matching text as a string, or
            # the default when nothing matches, so an entry missing one of
            # these tags prints an empty line instead of raising IndexError.
            name = node.xpath("./h3/text()").extract_first(default="")
            title = node.xpath("./h4/text()").extract_first(default="")
            info = node.xpath("./p/text()").extract_first(default="")
            print(name)
            print(title)
            print(info)
执行命令:
scrapy crawl itcast