Scraping Dangdang book listings with Scrapy

This run scrapes only the books returned by a search for "Python".
scrapy startproject ddbook
(cd ddbook/ddbook)
scrapy genspider -t basic book dangdang.com
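
The spider below imports DdbookItem from the project's items.py, so define that item first. A minimal sketch, with the five fields the spider fills in (the publication date is stored under the key "data" to match the spider code):

# items.py
import scrapy

class DdbookItem(scrapy.Item):
    title = scrapy.Field()   # book title
    author = scrapy.Field()  # author
    price = scrapy.Field()   # current price
    press = scrapy.Field()   # publisher
    data = scrapy.Field()    # publication date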
Then open book.py:

# -*- coding: utf-8 -*-
# http://search.dangdang.com/?key=Python&act=input&page_index=1#J_tab
# http://search.dangdang.com/?key=Python&act=input&page_index=2#J_tab
# 100 pages in total
import scrapy
from ..items import DdbookItem
from scrapy.http import Request
import re

class BookSpider(scrapy.Spider):
    name = 'book'
    allowed_domains = ['dangdang.com']
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"}
    # start_urls = ['http://dangdang.com/']

    def start_requests(self):
        for i in range(1, 2):
            # for i in range(1, 101):   # all 100 pages
            url = "http://search.dangdang.com/?key=Python&act=input&page_index=" + str(i) + "#J_tab"
            print(url)
            yield Request(url, headers=self.header, callback=self.parse)

    def parse(self, response):
        for j in range(1, 61):  # 60 books per page
            try:
                item = DdbookItem()
                # XPath prefix selecting the j-th book's <li> in the result list
                li = "//ul[@class='bigimg']/li[@class='line" + str(j) + "']"

                if response.xpath(li + "//a/@title"):
                    item["title"] = response.xpath(li + "//a/@title")[0].extract()
                    print(item["title"])
                else:
                    item["title"] = ''

                if response.xpath(li + "//a[@name='itemlist-author']/text()"):
                    item["author"] = response.xpath(li + "//a[@name='itemlist-author']/text()")[0].extract()
                    print(item["author"])
                else:
                    item["author"] = ''

                if response.xpath(li + "//span[@class='search_now_price']/text()"):
                    item["price"] = response.xpath(li + "//span[@class='search_now_price']/text()")[0].extract()
                    print(item["price"])
                else:
                    item["price"] = ''

                if response.xpath(li + "//a[@name='P_cbs']/text()"):
                    item["press"] = response.xpath(li + "//a[@name='P_cbs']/text()")[0].extract()
                    print(item["press"])
                else:
                    item["press"] = ''

                if response.xpath(li + "//span/text()"):
                    alldata = response.xpath(li + "//span/text()").extract()
                    # The date's position counted from the front varies,
                    # but it is always the second-to-last span text.
                    data = alldata[-2]
                    # The extracted text has a leading slash, so pull out
                    # just the yyyy-mm-dd date with a regex.
                    matches = re.findall(r"\d{4}-\d{2}-\d{2}", data)
                    item["data"] = matches[0] if matches else ''
                    print(item["data"])
                else:
                    item["data"] = ''

                yield item
            except Exception as e:
                print(e)
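
Run the spider from inside the project; no pipeline is needed just to inspect the output, since Scrapy's built-in feed export can dump the yielded items to a file:

scrapy crawl book -o books.csv

If the crawl is blocked by the site's robots.txt, you may also need ROBOTSTXT_OBEY = False in settings.py.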