Python log 2018.3.2

import requests
from fake_useragent import UserAgent

agent = UserAgent()

# The category URLs end in a number, so build each one with %s
# formatting inside a while loop over 1..19.
num = 1
while num < 20:
    url = 'http://www.kfzj.com.cn/browse/category/%s' % num
    num += 1
    headers = {
        # A random User-Agent for every request
        'User-Agent': agent.random,
    }
    response = requests.get(
        url,
        headers=headers,
    )
    print(response.url)

The URLs end in a number, so a while loop is used, building each string with %s formatting.

Knowledge points used: while loop, string formatting/concatenation.
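
For comparison, the same loop written with range() and an f-string (a minimal equivalent sketch, same URLs and headers as above):

import requests
from fake_useragent import UserAgent

agent = UserAgent()

# range(1, 20) covers the same category numbers 1 through 19
for num in range(1, 20):
    url = f'http://www.kfzj.com.cn/browse/category/{num}'
    response = requests.get(url, headers={'User-Agent': agent.random})
    print(response.url)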

import scrapy
import os
import requests
from ..items import KfzjspiderItem

class KfzjSpider(scrapy.Spider):
    name = 'kfzj'
    # Must match the site actually crawled (kfzj.com.cn, not kfzj.com),
    # otherwise the OffsiteMiddleware can drop follow-up requests.
    allowed_domains = ['kfzj.com.cn']
    start_urls = ['http://www.kfzj.com.cn']

    def parse(self, response):
        """
        Entry point: re-issue the start URL so parsing happens
        in parse_next_category.
        :param response:
        :return:
        """
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_next_category,
            meta={},
            dont_filter=True,  # same URL as start_urls, so skip the dupefilter
        )

    def parse_next_category(self, response):
        """
        Parse the top-level categories and their sub-categories.
        :param response:
        :return:
        """
        category_list = response.xpath('//div[@id="header"]//li[@class="abc"]')
        for category in category_list:
            little_list = category.xpath('ul/li/h3')
            for little in little_list:
                # The hrefs are relative, so prepend the site root
                href = little.xpath('a/@href').extract_first()
                href = 'http://www.kfzj.com.cn' + href
                category_name = little.xpath('a/text()').extract_first()
                print(href, category_name)
                yield scrapy.Request(
                    url=href,
                    callback=self.parse_xiang_qing,
                    meta={},
                    dont_filter=True,
                )

    def parse_xiang_qing(self, response):
        """
        Detail page: extract image, name and price for each product,
        log them to a text file and download the image.
        :param response:
        :return:
        """
        obj_list = response.xpath('//div[@class="category_pro_list"]/ul/li')
        for a_list in obj_list:
            img_src = a_list.xpath('div[@class="category_pro_pic"]/a/img/@src').extract_first('no image')
            img_src = 'http://www.kfzj.com.cn' + img_src
            # Default to '' so the f.write() calls below never receive None
            name = a_list.xpath('div[@class="category_pro_name"]/a/text()').extract_first('')
            price = a_list.xpath('div[@class="category_pro_price"]/p/strong/span/text()').extract_first('')
            # Don't name this "response": that would shadow the Scrapy response
            img_response = requests.get(img_src)
            path = 'a/' + 'b'
            if not os.path.exists(path):
                os.makedirs(path)

            with open(path + '/' + 'c.txt', 'a', encoding='utf-8') as f:
                f.write(img_src)
                f.write(name)
                f.write(price)
                f.write('\n')
            # Note: naming the file after the price means products with the
            # same price overwrite each other
            with open(path + '/' + price + '.jpg', 'wb') as f:
                f.write(img_response.content)
            # item = KfzjspiderItem()
            # item['img_src'] = img_src
            # item['name'] = name
            # item['price'] = price
            # yield item
            print(img_src, name, price)
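
The commented-out yield at the end assumes an Item defined in items.py; a minimal sketch of what that class would need to contain (field names taken from the assignments above, everything else is standard Scrapy boilerplate):

import scrapy

class KfzjspiderItem(scrapy.Item):
    # One Field per key assigned in the spider
    img_src = scrapy.Field()
    name = scrapy.Field()
    price = scrapy.Field()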

Used Scrapy to crawl the shop site, including the open() function:

with open() as f:

f.write()

Still not fluent with it!

Also practiced: downloading images from the scraped src addresses, what goes inside yield, and the two nested for loops that collect the URL addresses. A usage sketch of the two open() patterns follows.
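
A minimal sketch of both open() modes used in the spider (the file names are illustrative placeholders):

import requests

# Text mode: 'a' appends, so each run adds new lines instead of overwriting
with open('products.txt', 'a', encoding='utf-8') as f:
    f.write('img_src name price\n')

# Binary mode: 'wb' is required when writing raw bytes such as image content
img_response = requests.get('http://www.kfzj.com.cn')  # placeholder URL; real code uses the scraped img_src
with open('page_bytes.bin', 'wb') as f:
    f.write(img_response.content)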

An error came up when running Scrapy.

Typing scrapy crawl mySpider in cmd gives the following:

Scrapy 1.3.0 - no active project

Unknown command: crawl

Use "scrapy" to see available commands

Cause of the error: the scrapy.cfg file was deleted. "no active project" means Scrapy cannot find scrapy.cfg in the current directory, so project-only commands such as crawl are unavailable.
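
For reference, a minimal scrapy.cfg (the module name here is an assumption based on the project naming above; adjust it to the real project package):

[settings]
default = KfzjSpider.settings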

