spider编码
class DangdangSpider(scrapy.Spider):
    """Crawl Dangdang category listing pages and yield one item per product."""

    name = 'dangdang'
    allowed_domains = ['dangdang.com']
    start_urls = ['http://category.dangdang.com/pg1-cid4004279.html']

    def parse(self, response):
        """Parse one listing page.

        Yields:
            P9DangdangItem: one item per product <li> on the page.
            scrapy.Request: a request for the next page, when a next link exists.
        """
        li_list = response.xpath("//li[contains(@class, 'line')]")
        for i, li in enumerate(li_list):
            item = P9DangdangItem()
            item["name"] = li.xpath(".//p[@class='name']/a/@title").extract_first()
            item["price"] = li.xpath(".//p[@class='price']/span/text()").extract_first()
            item["review_num"] = li.xpath(".//p[@class='star']/a/text()").extract_first()
            item["shop_name"] = li.xpath(".//p[@class='link']/a/text()").extract_first()
            # The first 8 images are loaded eagerly (@src); the rest are
            # lazy-loaded and keep the real URL in @data-original.
            if i < 8:
                item["img_src"] = li.xpath(".//a[@class='pic']/img/@src").extract_first()
            else:
                item["img_src"] = li.xpath(".//a[@class='pic']/img/@data-original").extract_first()
            yield item
        # Fix: the original evaluated the next-link XPath twice inside one
        # conditional expression; extract the href once, then test it.
        next_href = response.xpath("//li[@class='next']/a/@href").extract_first()
        if next_href:
            next_page_url = "http://category.dangdang.com/" + next_href
            yield scrapy.Request(url=next_page_url, callback=self.parse)
pipelines编码
class P9DangdangPipeline(object):
    """Persist each scraped item into MongoDB (db 'dangdang', collection 'commodity')."""

    def __init__(self):
        # One client for the whole crawl; released in close_spider.
        self.mongo_client = pymongo.MongoClient(host="localhost", port=27017)
        self.to_data_path = self.mongo_client["dangdang"]["commodity"]

    def process_item(self, item, spider):
        """Insert the item into Mongo and pass it on to the next pipeline."""
        self.to_data_path.insert_one(dict(item))
        # Fix: f-string instead of '+' concatenation so a missing (None)
        # name cannot raise TypeError while logging.
        print(f"{item['name']} save to mongo successfully")
        return item

    def close_spider(self, spider):
        # Close the connection when the spider finishes.
        self.mongo_client.close()
class P9DangdangPipeline_save_img(object):
    """Download each item's cover image and save it under ./data/imgs/."""

    def process_item(self, item, spider):
        """Fetch item['img_src'], write it to disk, and return the item.

        Fix: the original never returned the item, which silently drops it
        for any pipeline ordered after this one in ITEM_PIPELINES.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        }
        src = item["img_src"]
        # Strip '/' so the product name cannot be read as a sub-directory.
        item["name"] = re.sub(r"/", "", item["name"])
        file_name = "./data/imgs/" + item["name"] + ".jpg"
        img_bytes = requests.get(src, headers=headers).content
        with open(file_name, "wb") as fp:
            fp.write(img_bytes)
        # Fix: missing space between the file name and the log message.
        print(file_name + " save to local path successfully")
        return item
items编码
class P9DangdangItem(scrapy.Item):
    """Container for one Dangdang product listing entry."""

    name = scrapy.Field()        # product title (@title of the name link)
    price = scrapy.Field()       # displayed price text
    review_num = scrapy.Field()  # review-count link text
    shop_name = scrapy.Field()   # seller/shop link text
    img_src = scrapy.Field()     # cover image URL (@src or @data-original)
settings.py设置
# Default headers sent with every request.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # Fix: the value previously began with a stray "User-Agent," prefix,
    # which sent a malformed User-Agent header.
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;',
}

# Fix: trailing commas after the closing braces turned both settings into
# 1-tuples, which Scrapy cannot consume; the settings must be plain dicts.
# Lower number = runs earlier, so Mongo save runs before the image download.
ITEM_PIPELINES = {
    'p9_dangdang.pipelines.P9DangdangPipeline': 300,
    'p9_dangdang.pipelines.P9DangdangPipeline_save_img': 301,
}

# Throttle: wait 1 second between consecutive requests.
DOWNLOAD_DELAY = 1
爬取结果如下: