# Crawl responsibly by identifying yourself (and your website) in the
# User-Agent header sent with every request.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

# Deliberately do NOT obey robots.txt for this practice crawl.
ROBOTSTXT_OBEY = False
代码——运行:scrapy crawl dangdang(注意:本爬虫的 name 是 'dangdang',不是 'douban')

当当练习

spider(爬虫文件)
import scrapy
import bs4
from..items import DangdangItem
# DangdangItem lives in items.py, one package level above this spider
# module, hence the explicit relative import ``from ..items import
# DangdangItem`` — a fixed Scrapy project convention.
class DangdangSpiders(scrapy.Spider):
    """Spider for the Dangdang 2018 yearly bestseller book ranking.

    Crawls pages 1-3 of the ranking and yields one ``DangdangItem``
    per book, with fields ``name`` (title), ``ator`` (publisher
    info — field name kept as-is to match items.py), ``price``.
    """

    name = 'dangdang'
    # allowed_domains must contain bare domain names, not full URLs;
    # a scheme-prefixed entry breaks Scrapy's offsite filtering.
    allowed_domains = ['bang.dangdang.com']
    # First three pages of the 2018 bestseller ranking.
    start_urls = [
        'http://bang.dangdang.com/books/bestsellers/'
        '01.00.00.00.00.00-year-2018-0-1-' + str(page)
        for page in range(1, 4)
    ]

    def parse(self, response):
        """Parse one ranking page and yield an item per book entry.

        Parameters
        ----------
        response : scrapy.http.Response
            The downloaded ranking page.

        Yields
        ------
        DangdangItem
            One item per ``<li>`` row of the ranking list.
        """
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        book_rows = soup.find(
            class_="bang_list clearfix bang_list_mode").find_all("li")
        for row in book_rows:
            item = DangdangItem()
            item['name'] = row.find(class_='name').find("a").text
            # Publisher info is split across several <a> tags;
            # collect their texts and join with commas.
            publisher_links = row.find(class_='publisher_info').find_all('a')
            item['ator'] = ','.join(a.text for a in publisher_links)
            item['price'] = row.find(class_='price').find(class_="price_n").text
            yield item
items.py(item 定义文件)
# -*- coding: utf-8 -*-
# Define here the models for your scraped items.
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class DangdangItem(scrapy.Item):
    """Container for one bestseller book entry scraped from Dangdang."""

    name = scrapy.Field()   # book title
    # NOTE: 'ator' (publisher info) is kept with its original spelling —
    # the spider writes item['ator'], so renaming it would break the crawl.
    ator = scrapy.Field()
    price = scrapy.Field()  # list price text, e.g. '¥39.50'