spiders 写法(spiders 目录中的 meizi.py):
# -*- coding: utf-8 -*-
import scrapy
import os,random
import requests
from pypinyin import lazy_pinyin
from Meizi.items import MeiziItem
# Pool of request-header dicts; callers pick one with random.choice(UA) and
# read its "User-Agent" key, so every entry must be a dict with exactly that key.
UA = [
    {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    # Closing parenthesis restored.
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'},
    # Was a one-element set ({'User-Agent: ...'}), which broke the ["User-Agent"] lookup.
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},
    {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'},
    {'User-Agent': 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'},
    {'User-Agent': 'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'},
    {'User-Agent': 'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'},
    {'User-Agent': 'Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'},
    {'User-Agent': 'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'},
    {'User-Agent': 'Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10'},
    {'User-Agent': 'Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13'},
    {'User-Agent': 'Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1'},
    {'User-Agent': 'Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0'},
    {'User-Agent': 'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)'},
    {'User-Agent': 'UCWEB7.0.2.37/28/999'},
    {'User-Agent': 'NOKIA5700/ UCWEB7.0.2.37/28/999'},
    {'User-Agent': 'Openwave/ UCWEB7.0.2.37/28/999'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999'},
]
class MeiziSpider(scrapy.Spider):
    """Walk listing pages, open every album and emit one item per image page.

    Request flow: ``start_requests`` -> ``parse`` (listing page) ->
    ``parse_item`` (album page, fans out to its numbered pages) ->
    ``parse_img`` (yields a ``MeiziItem``).
    """

    name = 'meizi'
    # NOTE(review): the domain is masked in this listing; it presumably has to
    # match the host used in start_requests for follow-up requests to pass the
    # offsite filter -- confirm before running.
    allowed_domains = ['****.com']
    # start_urls = ['http://***.com/']

    def _headers(self, response):
        """Build headers that present ``response`` as referer with a random User-Agent."""
        return {
            'referer': response.url,
            'user-agent': random.choice(UA)["User-Agent"],
        }

    def start_requests(self):
        """Yield the listing pages to crawl (currently only page 3)."""
        url = "https://www.***.com/xinggan/page/"
        for i in range(3, 4):
            yield scrapy.Request(url + str(i))

    def parse(self, response):
        """Follow every album link found on a listing page."""
        header = self._headers(response)
        for album_url in response.xpath("//ul[@id='pins']/li/a/@href").extract():
            yield scrapy.Request(album_url, callback=self.parse_item, headers=header)

    def parse_item(self, response):
        """Request every numbered page of the album shown in ``response``."""
        last_page = response.xpath(
            "//div[@class='pagenavi']/a[5]/span/text()").extract_first()
        try:
            page_count = int(last_page)
        except (TypeError, ValueError):
            # The pager node is missing or is not a number: skip this album
            # instead of aborting the callback with IndexError/ValueError.
            self.logger.warning(
                "no usable page count on %s (got %r)", response.url, last_page)
            return
        header = self._headers(response)
        # Strip a trailing slash so the page URL never contains "//".
        base = response.url.rstrip('/')
        for i in range(1, page_count + 1):
            yield scrapy.Request(
                base + '/' + str(i), headers=header, callback=self.parse_img)

    def parse_img(self, response):
        """Yield a ``MeiziItem`` describing the image on a single album page.

        The item carries the image URL (``imgurl``), the page title
        (``imgname``) and the image alt text (``name``); the pipeline uses the
        latter two to build the storage path.
        """
        imgurl = response.xpath(
            "//div[@class='main-image']/p/a/img/@src").extract_first()
        alt = response.xpath(
            "//div[@class='main-image']/p/a/img/@alt").extract_first()
        title = response.xpath(
            "//h2[@class='main-title']/text()").extract_first()
        if imgurl is None or alt is None or title is None:
            self.logger.warning(
                "image page %s lacks the expected nodes; skipped", response.url)
            return
        item = MeiziItem()
        item['imgurl'] = imgurl
        # NOTE(review): the first 15 characters of both texts are dropped --
        # presumably a fixed prefix the site prepends; confirm against a live page.
        item['imgname'] = title[15:]
        item['name'] = alt[15:]
        # The image itself is fetched by the image pipeline. Downloading it
        # here with the requests module is an alternative when fully custom
        # file naming is wanted.
        yield item
items.py:
import scrapy
class MeiziItem(scrapy.Item):
    """Fields collected for one image page of an album."""

    # Album name; the spider fills it from the image's alt text.
    name = scrapy.Field()
    # Image source URL; the image pipeline later replaces it with stored paths.
    imgurl = scrapy.Field()
    # Page title; the image pipeline uses it as the stored file name.
    imgname = scrapy.Field()
pipelines.py:
import random
import re

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

from Meizi.spiders.meizi import UA
class MeiziPipeline(object):
    """Pass-through pipeline stage: every item is forwarded unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` as received; ``spider`` is not consulted."""
        # Nothing to transform at this stage.
        return item
class ImagePipeline(ImagesPipeline):
    """Download each item's image and store it as ``full/<name>/<imgname>``."""

    # Characters removed from path components. The typographic quote of the
    # original pattern is kept and the ASCII double quote it presumably stood
    # for is added.
    _ILLEGAL_CHARS = r'[?\\*|“"<>:/]'

    def _clean(self, text):
        """Strip surrounding spaces and remove characters unsafe in file names."""
        return re.sub(self._ILLEGAL_CHARS, '', text.strip(' '))

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image.

        The item's ``imgname`` and ``name`` travel in ``request.meta`` because
        ``file_path`` only receives the request and reads them from there.
        """
        header = {
            'Referer': '{}'.format(item['imgurl']),
            'User-Agent': random.choice(UA)['User-Agent'],
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'upgrade-insecure-requests': '1',
            'Connection': 'keep-alive',
        }
        yield scrapy.Request(
            item['imgurl'],
            headers=header,
            meta={'imgname': item['imgname'], 'name': item['name']},
        )

    def item_completed(self, results, item, info):
        """Replace ``item['imgurl']`` with the stored paths.

        Raises:
            DropItem: when no download in ``results`` succeeded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['imgurl'] = image_paths
        return item

    def file_path(self, request, response=None, info=None):
        """Return the storage path ``full/<name>/<imgname>`` for ``request``."""
        imgname = self._clean(request.meta['imgname'])
        name = self._clean(request.meta['name'])
        return u'full/{}/{}'.format(name, imgname)
settings.py配置:
# --- Project identity and spider discovery ---
BOT_NAME = "Meizi"
SPIDER_MODULES = ["Meizi.spiders"]
NEWSPIDER_MODULE = "Meizi.spiders"

# --- Crawl behaviour ---
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3

# --- Item pipelines ---
ITEM_PIPELINES = {
    "Meizi.pipelines.MeiziPipeline": 300,
    "Meizi.pipelines.ImagePipeline": 300,  # turns the image pipeline on
}
IMAGES_STORE = "/home/gg/Meizi/Meizi/Images"  # directory the images are stored in
java抓取:
```java
package com.atgui.springboot.controller;
import org.apache.commons.collections.map.LinkedMap;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.HttpClientUtils;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import us.codecraft.xsoup.Xsoup;
import javax.net.ssl.SSLContext;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.channels.UnresolvedAddressException;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.*;
public class meizi {
/**
 * User-Agent values handed out by {@link #UA()}. Built once instead of on every call.
 * Each entry is a bare header value: no "User-Agent:" prefix, balanced parentheses,
 * and the standard "KHTML, like Gecko" token. Duplicate entries have been removed.
 */
private static final List<String> USER_AGENTS = Collections.unmodifiableList(Arrays.asList(
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
));

/** Generator shared by every call to {@link #UA()}. */
private static final Random UA_RANDOM = new Random();

/**
 * Picks one User-Agent header value at random.
 *
 * @return a non-empty header value taken from {@link #USER_AGENTS}
 */
public static String UA(){
    return USER_AGENTS.get(UA_RANDOM.nextInt(USER_AGENTS.size()));
}
/**
 * Builds an HTTPS-capable client that accepts every certificate and every host name.
 *
 * <p>SECURITY: certificate-chain and host-name validation are both switched off, which
 * leaves connections open to man-in-the-middle attacks. Use it only for content that
 * does not need to be trusted.
 *
 * @return the permissive client, or {@code HttpClients.createDefault()} when the
 *         permissive SSL context cannot be built
 */
public static CloseableHttpClient createSSLClientDefault() {
    try {
        SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(null, new TrustStrategy() {
            // Trust every certificate chain.
            @Override
            public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                return true;
            }
        }).build();
        // NoopHostnameVerifier replaces the deprecated ALLOW_ALL_HOSTNAME_VERIFIER constant.
        SSLConnectionSocketFactory sslsf =
                new SSLConnectionSocketFactory(sslContext, NoopHostnameVerifier.INSTANCE);
        return HttpClients.custom().setSSLSocketFactory(sslsf).build();
    } catch (KeyManagementException | NoSuchAlgorithmException | KeyStoreException e) {
        // Report the failure and fall through to the default client below.
        e.printStackTrace();
    }
    return HttpClients.createDefault();
}
/**
 * Fetches {@code url} and returns the response body decoded as UTF-8.
 *
 * <p>After a successful fetch the method pauses for one second before returning.
 *
 * @param url absolute URL of the page to download
 * @return the page body, or {@code null} when the status is not 200 or the request fails
 */
public static String getHtml(String url) {
    String html = null;
    // try-with-resources closes both the response and the client on every path.
    try (CloseableHttpClient httpClient = createSSLClientDefault()) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("accept", "text/html");
        // "br" is not advertised: HttpClient 4.x cannot decode Brotli, so a Brotli
        // response would reach EntityUtils.toString still compressed.
        httpGet.setHeader("accept-Encoding", "gzip, deflate");
        httpGet.setHeader("accept-Language", "zh-CN,zh;q=0.8");
        httpGet.setHeader("connection", "keep-alive");
        httpGet.setHeader("referer","http://www.mzitu.com/xinggan/");
        // No explicit Host header: HttpClient derives it from the request URL. The
        // previous hard-coded value had a stray leading space and tied this method to one host.
        httpGet.setHeader("user-agent", UA());
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                html = EntityUtils.toString(response.getEntity(), "utf-8");
            } else {
                System.out.println("not 200");
                System.out.println(EntityUtils.toString(response.getEntity()));
            }
        }
    } catch (IOException | RuntimeException e) {
        // Any failure is reported and turned into a null result.
        e.printStackTrace();
    }
    if (html != null) {
        try {
            Thread.sleep(1000);
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller instead of swallowing it.
            Thread.currentThread().interrupt();
        }
    }
    return html;
}
/**
 * Collects the album links of listing pages {@code url/1} .. {@code url/page}.
 *
 * @param url  listing base URL without a trailing slash
 * @param page number of listing pages to read, starting at 1
 * @return one list of album URLs per listing page, in page order; {@code null} as soon
 *         as one listing page cannot be downloaded
 */
public static List<List<String>> getResponse(String url,int page){
    List<List<String>> pageUrl = new ArrayList<>();
    for (int i = 1; i <= page; i++) {
        String html = getHtml(url + "/" + i);
        if (html == null) {
            return null;
        }
        Document document = Jsoup.parse(html);
        List<String> albumUrls = Xsoup.compile("//ul[@id='pins']/li/a/@href").evaluate(document).list();
        System.out.println(albumUrls);
        pageUrl.add(albumUrls);
    }
    return pageUrl;
}
/**
 * Expands every album found on the listing pages into the URLs of its numbered pages.
 *
 * @param url  listing base URL, passed through to {@link #getResponse(String, int)}
 * @param page number of listing pages to read
 * @return one list per album holding {@code albumUrl/1} .. {@code albumUrl/lastPage};
 *         {@code null} when the listing pages could not be read. Albums whose page
 *         cannot be downloaded, or whose title or page count is missing or invalid,
 *         are skipped.
 */
public static List<List<String>> parse(String url,int page)
{
    List<List<String>> pageUrl = getResponse(url, page);
    if (pageUrl == null) {
        return null;
    }
    List<List<String>> img = new ArrayList<>();
    for (List<String> albumUrls : pageUrl) {
        for (String albumUrl : albumUrls) {
            String html = getHtml(albumUrl);
            if (html == null) {
                continue;
            }
            Document document = Jsoup.parse(html);
            String name = Xsoup.compile("//h2[@class='main-title']/text()").evaluate(document).get();
            String endpage = Xsoup.compile("//div[@class='pagenavi']/a[5]/span/text()").evaluate(document).get();
            if (name == null) {
                continue;
            }
            System.out.println("=====>>>" + name);
            System.out.println("=====>>>" + endpage);
            if (endpage == null) {
                // No pager on the page: nothing to expand.
                continue;
            }
            int lastPage;
            try {
                lastPage = Integer.parseInt(endpage.trim());
            } catch (NumberFormatException e) {
                // Only a malformed page count is expected here; report it and move on.
                e.printStackTrace();
                continue;
            }
            System.out.println(lastPage);
            if (lastPage < 1) {
                continue;
            }
            List<String> imgurl = new ArrayList<>();
            for (int i = 1; i <= lastPage; i++) {
                imgurl.add(albumUrl + "/" + i);
            }
            img.add(imgurl);
        }
    }
    return img;
}
/*
Reference: the equivalent extraction in the Scrapy spider's parse_img (Python).
imgurl=response.xpath("//div[@class='main-image']/p/a/img/@src").extract()
# print(imgurl)
name=response.xpath("//div[@class='main-image']/p/a/img/@alt").extract()
imgname=response.xpath("//h2[@class='main-title']/text()").extract()[0][15:]
dirs='/home/gong/gg/Images/%s'%name[0][15:]
*/
/**
 * Downloads every image of every album reachable from the listing pages and stores it
 * as {@code F:\meizi\<alt text>\<page title>.jpg}.
 *
 * @param url  listing base URL, passed through to {@link #parse(String, int)}
 * @param page number of listing pages to read
 */
public static void parse_img(String url,int page){
    List<List<String>> img = parse(url, page);
    if (img==null){
        System.out.println("网络");
        return;
    }
    // One client serves all downloads; try-with-resources closes it at the end.
    try (CloseableHttpClient httpClient = createSSLClientDefault()) {
        for (List<String> list : img) {
            for (String s : list) {
                if (Thread.currentThread().isInterrupted()) {
                    return;
                }
                String html = getHtml(s);
                if (html==null){
                    continue;
                }
                Document document = Jsoup.parse(html);
                String name= Xsoup.compile("//div[@class='main-image']/p/a/img/@alt").evaluate(document).get();
                String imgname=Xsoup.compile("//h2[@class='main-title']/text()").evaluate(document).get();
                String imgurl=Xsoup.compile("//div[@class='main-image']/p/a/img/@src").evaluate(document).get();
                System.out.println("=====>>>"+name);
                System.out.println("=====>>>"+imgurl);
                System.out.println(imgname);
                if (name == null || imgname == null || imgurl == null) {
                    // Without all three values there is nothing sensible to store.
                    continue;
                }
                File dir = new File("F:\\meizi", safeFileName(name));
                File target = new File(dir, safeFileName(imgname) + ".jpg");
                downloadImage(httpClient, s, imgurl, target);
            }
        }
    } catch (IOException e) {
        // Only closing the client can fail here.
        e.printStackTrace();
    }
}

/** Removes characters that are not allowed in file names (same set the Scrapy pipeline strips). */
private static String safeFileName(String text) {
    return text.trim().replaceAll("[?\\\\*|“\"<>:/]", "");
}

/** Fetches {@code imgurl} with {@code pageUrl} as referer and writes the body to {@code target}. */
private static void downloadImage(CloseableHttpClient httpClient, String pageUrl, String imgurl, File target) {
    try {
        HttpGet httpGet = new HttpGet(imgurl);
        System.out.println("url="+pageUrl);
        System.out.println("img="+imgurl);
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
        httpGet.setHeader("Referer", pageUrl);
        httpGet.setHeader("User-Agent", UA());
        // The response is closed on every path, including non-200 and I/O failures.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
                System.out.println("not 200");
                System.out.println(EntityUtils.toString(response.getEntity()));
                return;
            }
            HttpEntity httpEntity = response.getEntity();
            InputStream imghtml = httpEntity == null ? null : httpEntity.getContent();
            if (imghtml == null) {
                return;
            }
            File dir = target.getParentFile();
            // mkdirs also creates missing parent directories; plain mkdir could not.
            if (!dir.exists() && !dir.mkdirs()) {
                System.out.println("cannot create " + dir.getPath());
                return;
            }
            FileUtils.copyToFile(imghtml, target);
            System.out.println(target.getPath()+"下载完成");
        }
        // Pause between successful downloads only.
        Thread.sleep(1000);
    } catch (InterruptedException e) {
        // Keep the interrupt visible so the download loop can stop.
        Thread.currentThread().interrupt();
    } catch (IOException | RuntimeException e) {
        e.printStackTrace();
    }
}
/** Entry point: downloads the images reachable from the first listing page. */
public static void main(String[] args) {
    final int pages = 1;
    final String listingBase = "https://www.mzitu.com/xinggan/page";
    parse_img(listingBase, pages);
}
}