import scrapy
from scrapy.http import Request
from jj20.items import Jj20Item
class Jj20pSpider(scrapy.Spider):
    """Spider that walks jj20.com category pages and collects wallpaper image URLs.

    Crawl flow:
      parse -> category links taken from the top navigation bar (layer 1)
      next  -> builds the URL of every paginated listing page (layer 2)
      next2 -> one listing page; yields a Jj20Item with its image URLs (layer 3)
    """

    name = "jj20p"
    allowed_domains = ["jj20.com"]
    start_urls = ['http://jj20.com/']

    def parse(self, response):
        """Layer 1: extract category links from the navigation menu."""
        urldata = response.xpath("//ul[@id='navul']//li/a/@href").extract()
        # Only nav entries 1..21 are real category links we want to crawl.
        for href in urldata[1:22]:
            yield Request(url=response.urljoin(href), callback=self.next)

    def next(self, response):
        """Layer 2: construct the URL of every listing page in a category."""
        # The pager's second-to-last <a> carries the total page count as its
        # text, and an href of the form "list_1_69.html" we can template from.
        page = response.xpath('//div[@class="pagea"]/div[@class="page"]//a/text()').extract()
        url2href = response.xpath('//div[@class="pagea"]/div[@class="page"]//a/@href').extract()
        pages = page[-2]          # total number of pages in this category
        url2hrefstr = url2href[-2]  # e.g. "list_1_69.html"
        arr = url2hrefstr.split("_")
        urlstr_half = arr[0] + "_" + arr[1] + "_"
        # Build every page URL. The range end is pages + 1 so the last page
        # is included — the original range(1, int(pages)) dropped it.
        for j in range(1, int(pages) + 1):
            urlsnew = response.urljoin(urlstr_half + str(j) + ".html")
            yield Request(url=urlsnew, callback=self.next2)

    def next2(self, response):
        """Layer 3: collect the image URLs from one listing page."""
        image_urls = response.xpath('//ul[@class="pic2 vvi fix"]//li/a[1]/img/@src').extract()
        item = Jj20Item()
        item["url"] = image_urls
        yield item
# --------------------------------- file separator ---------------------------------
###################### pipelines.py file
import urllib.request
#import http.cookiejar
import random
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class Jj20Pipeline(object):
    """Download every image URL carried by an item to a local directory.

    NOTE(review): the target directory is hard-coded below and must already
    exist — urlretrieve does not create directories. Confirm the path.
    """

    # Pool of browser User-Agent strings; one is picked at random per image so
    # the site is less likely to block the downloads. (The site refuses image
    # requests without a browser-like User-Agent.)
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
    ]

    def process_item(self, item, spider):
        """Fetch each URL in item["url"]; failures are logged and skipped.

        Returns the item unchanged so any later pipeline still receives it.
        """
        for src in item["url"]:
            # Local filename: last 21 characters of the remote URL.
            filename = "E:/csdn-python公开课配套资料/r6/" + src[-21:]
            try:
                # Install a global opener with a random browser User-Agent.
                # The original chose via int(random.random()*5) with branches
                # testing 1..5: value 0 silently fell through to the default
                # and value 5 was unreachable. random.choice is uniform.
                opener = urllib.request.build_opener()
                opener.addheaders = [("User-Agent", random.choice(self.USER_AGENTS))]
                urllib.request.install_opener(opener)
                urllib.request.urlretrieve(src, filename=filename)
            except Exception as e:
                # The original printed e.code, which itself raises
                # AttributeError for exceptions lacking that attribute
                # (e.g. OSError from a bad path). Print the exception.
                print(e)
        return item