spider
import scrapy
from dingdian.items import DingdianItem
class A123wxCcSpider(scrapy.Spider):
    """Spider that scrapes book names and authors from www.123wx.cc category pages."""

    name = '123wx_cc'
    start_urls = ['https://www.123wx.cc/']

    def parse(self, response):
        """Collect the site's category links and follow each one to show_info."""
        # The slice [1:8] keeps seven nav entries, skipping the first link --
        # presumably the "home" entry; confirm against the live page layout.
        category_paths = response.xpath('//ul/li/a/@href').extract()[1:8]
        for path in category_paths:
            # Hrefs are site-relative, so prepend the host.
            yield scrapy.Request('https://www.123wx.cc' + path, self.show_info)

    def show_info(self, response):
        """Yield one DingdianItem per book row on a category listing page."""
        for row in response.xpath('//div[@class="l bd"]/ul/li'):
            # BUG FIX: the original created a single DingdianItem before the
            # loop and mutated it on every iteration; because Scrapy pipelines
            # may process items asynchronously, every yielded reference could
            # end up holding the last row's data. Create a fresh item per row.
            item = DingdianItem()
            item['book_name'] = row.xpath('span[2]/a/text()').extract_first()
            # '暂无' ("none yet") is the fallback when no author is listed.
            item['book_author'] = row.xpath('span[@class="s4"]/text()').extract_first('暂无')
            yield item
setting
# --- settings.py (excerpt) ---
# Route every scraped item through DingdianPipeline; 300 is the pipeline's
# priority (lower numbers run earlier when several pipelines are enabled).
ITEM_PIPELINES = {
    'dingdian.pipelines.DingdianPipeline': 300,
}
# MySQL credentials, read back by DingdianPipeline.from_crawler.
# NOTE(review): plaintext credentials in settings -- fine for a tutorial,
# move to environment variables for anything real.
MYSQL_USERNAME = 'root'
MYSQL_PWD = '123456'
MYSQL_DB = 'xiaoshuo'
items
import scrapy
class DingdianItem(scrapy.Item):
    """Container for one scraped book record from 123wx.cc."""

    # Book title, taken from the listing row's second <span>.
    book_name = scrapy.Field()
    # Author name; the spider substitutes '暂无' when the page omits it.
    book_author = scrapy.Field()
pipelines
from itemadapter import ItemAdapter
import pymysql
class DingdianPipeline:
    """Persist each DingdianItem into the MySQL table `tests`."""

    def __init__(self, username, password, db):
        # Only store the connection parameters here; the actual connection is
        # opened lazily in open_spider so construction stays side-effect free.
        self.username = username
        self.password = password
        self.db = db

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor: pull the MySQL credentials from Scrapy settings."""
        return cls(
            username=crawler.settings.get('MYSQL_USERNAME'),
            password=crawler.settings.get('MYSQL_PWD'),
            db=crawler.settings.get('MYSQL_DB'),
        )

    def open_spider(self, spider):
        # One connection/cursor pair per crawl; both are released in close_spider.
        self.client = pymysql.connect(user=self.username, password=self.password, db=self.db)
        self.cursor = self.client.cursor()

    def process_item(self, item, spider):
        """Insert one item and return it unchanged for any later pipelines."""
        # BUG FIX: the original interpolated the values with str.format, which
        # breaks on any title containing a double quote and is an SQL-injection
        # vector. Use pymysql's parameter binding instead.
        sql = 'insert into tests(book_name,book_author) values (%s,%s)'
        self.cursor.execute(sql, (item['book_name'], item['book_author']))
        self.client.commit()
        return item

    def close_spider(self, spider):
        # Closing the connection also invalidates the cursor.
        self.client.close()
京东 (JD.com)
spider
import scrapy
from lxml import etree
class JdComSpider(scrapy.Spider):
    """Spider for JD.com search results (keyword 大地瓜), pages 1-6."""

    name = 'jd_com'

    # AJAX search endpoint; only `page` and the result offset `s` vary.
    SEARCH_URL = ('https://search.jd.com/s_new.php?keyword=%E5%A4%A7%E5%9C%B0%E7%93%9C&qrst=1&'
                  'wq=%E5%A4%A7%E5%9C%B0%E7%93%9C&stock=1&pvid=3b8d62d7e2294365911b56aeb0653d3d&'
                  'page={}&s={}&click=0')

    def start_requests(self):
        # Same URLs as the original's two duplicated branches plus a running
        # counter: page 1 uses s=1, page n (n>=2) uses s = 26*(n-1).
        for page in range(1, 7):
            s = 1 if page == 1 else 26 * (page - 1)
            yield scrapy.Request(self.SEARCH_URL.format(page, s), self.goods)

    def goods(self, response):
        """Print (price, description, store name) for every product on the page."""
        # BUG FIX: the original called etree.HTML(response.text) three times,
        # re-parsing the same document for each xpath. Parse once and reuse.
        tree = etree.HTML(response.text)
        prices = tree.xpath('//div[@class="p-price"]/strong/i/text()')
        # A product description is split across <em> fragments; join them.
        intros = [''.join(a.xpath('em/text()')).strip()
                  for a in tree.xpath('//div[@class="p-name p-name-type-2"]/a')]
        store_names = tree.xpath('//div[@class="p-shop"]/span/a/text()')
        # BUG FIX: indexing all three lists by len(prices) raised IndexError
        # whenever a product had no store link; zip stops at the shortest list.
        for price, intro, store in zip(prices, intros, store_names):
            print(price, intro, store)
setting
# Default request headers for the JD search spider.
# BUG FIX: the original fused 'x-requested-with: XMLHttpRequest' onto the end
# of the user-agent value (a missing quote/comma). It belongs in its own
# header -- presumably the s_new.php AJAX endpoint checks for it; verify
# against a captured browser request.
DEFAULT_REQUEST_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
    'referer': 'https://search.jd.com/Search?keyword=%E5%A4%A7%E5%9C%B0%E7%93%9C&qrst=1&wq=%E5%A4%A7%E5%9C%B0%E7%93%9C&stock=1&pvid=1bf8e67af6b74013a464ee84079faf98&page=1&s=1&click=0'
}