import requests
from fake_useragent import UserAgent

agent = UserAgent()
num = 1
while num < 20:
    url = 'http://www.kfzj.com.cn/browse/category/%s' % num
    num += 1
    headers = {
        'User-Agent': agent.random,
    }
    response = requests.get(
        url,
        headers=headers,
    )
    print(response.url)
The category URLs end with a number, so a while loop is used and the number is spliced into the string with %s.
Knowledge points used: while loop, string formatting.
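The same loop can also be written with for/range and an f-string; this is only an alternative sketch of the same idea, reusing the agent defined above:

for num in range(1, 20):
    url = f'http://www.kfzj.com.cn/browse/category/{num}'
    headers = {'User-Agent': agent.random}
    response = requests.get(url, headers=headers)
    print(response.url)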
import scrapy
import os
import requests
from ..items import KfzjspiderItem


class KfzjSpider(scrapy.Spider):
    name = 'kfzj'
    allowed_domains = ['kfzj.com.cn']
    start_urls = ['http://www.kfzj.com.cn']

    def parse(self, response):
        """
        :param response:
        :return:
        """
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_next_category,
            meta={},
            dont_filter=True,
        )

    def parse_next_category(self, response):
        """
        Parse the top-level categories.
        :param response:
        :return:
        """
        category_list = response.xpath('//div[@id="header"]//li[@class="abc"]')
        for category in category_list:
            little_list = category.xpath('ul/li/h3')
            for little in little_list:
                href = little.xpath('a/@href').extract_first()
                href = 'http://www.kfzj.com.cn' + href
                category_name = little.xpath('a/text()').extract_first()
                print(href, category_name)
                yield scrapy.Request(
                    url=href,
                    callback=self.parse_xiang_qing,
                    meta={},
                    dont_filter=True,
                )

    def parse_xiang_qing(self, response):
        """
        Parse the detail (product list) page.
        :param response:
        :return:
        """
        obj_list = response.xpath('//div[@class="category_pro_list"]/ul/li')
        for a_list in obj_list:
            img_src = a_list.xpath('div[@class="category_pro_pic"]/a/img/@src').extract_first('no image')
            img_src = 'http://www.kfzj.com.cn' + img_src
            name = a_list.xpath('div[@class="category_pro_name"]/a/text()').extract_first('')
            price = a_list.xpath('div[@class="category_pro_price"]/p/strong/span/text()').extract_first('')
            # Download the image with requests; use a new variable name so the
            # Scrapy response passed into this callback is not overwritten.
            img_response = requests.get(img_src)
            path = 'a/' + 'b'
            if not os.path.exists(path):
                os.makedirs(path)
            with open(path + '/' + 'c.txt', 'a', encoding='utf-8') as f:
                f.write(img_src)
                f.write(name)
                f.write(price)
                f.write('\n')
            with open(path + '/' + price + '.jpg', 'wb') as f:
                f.write(img_response.content)
            # item = KfzjspiderItem()
            # item['img_src'] = img_src
            # item['name'] = name
            # item['price'] = price
            # yield item
            print(img_src, name, price)
Used Scrapy to crawl the mall site; the open() function is used:
with open() as f:
    f.write()
I am still not fluent with it!
Also needed: downloading the images from the scraped src addresses, what to put in the yield, and the two nested for loops that collect the URL addresses. A pipeline-based alternative is sketched below.
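Rather than calling requests.get() and open() directly inside the spider callback, the commented-out item code above can be re-enabled and the file writing moved into an item pipeline. A minimal sketch, assuming the fields img_src, name and price are declared in items.py and the pipeline is registered under ITEM_PIPELINES in settings.py (the module path depends on the project package name):

# items.py: field declarations matching the commented-out item code
import scrapy

class KfzjspiderItem(scrapy.Item):
    img_src = scrapy.Field()
    name = scrapy.Field()
    price = scrapy.Field()

# pipelines.py: write the text file and download the image for each item
import os
import requests

class KfzjspiderPipeline(object):
    def process_item(self, item, spider):
        path = 'a/b'
        if not os.path.exists(path):
            os.makedirs(path)
        # Append the scraped fields to one text file.
        with open(path + '/c.txt', 'a', encoding='utf-8') as f:
            f.write(item['img_src'])
            f.write(item['name'])
            f.write(item['price'])
            f.write('\n')
        # Download the image and, as in the spider, name the file after the price.
        img_response = requests.get(item['img_src'])
        with open(path + '/' + item['price'] + '.jpg', 'wb') as f:
            f.write(img_response.content)
        return item

With this in place the spider only yields items, and Scrapy passes each one through process_item().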
An error encountered when running scrapy:
Typing scrapy crawl mySpider in cmd gives the following message:
Scrapy 1.3.0 - no active project
Unknown command: crawl
Use "scrapy" to see available commands
Cause of the error: this file was deleted. scrapy crawl only works inside an active project, that is, when it is run from the directory containing the project's scrapy.cfg; without that file Scrapy reports "no active project" and the crawl command is not available.
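If the deleted file was scrapy.cfg, recreating it in the project root (the directory you run scrapy crawl from, next to the project package) restores the active project. A minimal sketch, assuming the project package is named KfzjSpider (inferred from the KfzjspiderItem import; adjust to the real package name):

[settings]
default = KfzjSpider.settings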