scrapy 爬取某图源码

spiders 目录中 meizi.py 的写法:

# -*- coding: utf-8 -*-
import scrapy
import os,random
import requests
from pypinyin import lazy_pinyin
from Meizi.items import MeiziItem
# Raw browser User-Agent strings used to vary the identity of outgoing requests.
_UA_STRINGS = (
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)',
    # Kept twice on purpose: the original list had this entry twice, which
    # preserves the list length and the selection weights.
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
    'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
    'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
    'Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
    'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
    'Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10',
    'Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13',
    'Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1',
    'Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0',
    'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)',
    'UCWEB7.0.2.37/28/999',
    'NOKIA5700/ UCWEB7.0.2.37/28/999',
    'Openwave/ UCWEB7.0.2.37/28/999',
    'Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999',
)

# Header dicts consumed as ``random.choice(UA)['User-Agent']``.
# Built from the plain strings above so that every entry is guaranteed to be a
# dict with a 'User-Agent' key (one original entry was accidentally a set,
# which made the lookup raise TypeError whenever it was picked).
UA = [{'User-Agent': ua_string} for ua_string in _UA_STRINGS]
class MeiziSpider(scrapy.Spider):
    """Crawl gallery listing pages and emit one ``MeiziItem`` per image.

    Request flow: listing page (``parse``) -> gallery page (``parse_item``)
    -> one page per image (``parse_img``).
    """

    name = 'meizi'
    allowed_domains = ['****.com']
    # start_urls = ['http://***.com/']

    def start_requests(self):
        """Yield the listing-page requests; ``range(3, 4)`` covers page 3 only."""
        url = "https://www.***.com/xinggan/page/"
        for i in range(3, 4):
            yield scrapy.Request(url + str(i))

    def parse(self, response):
        """Follow every gallery link found on a listing page."""
        header = {
            'referer': response.url,
            'user-agent': random.choice(UA)["User-Agent"],
        }
        for link in response.xpath("//ul[@id='pins']/li/a/@href").extract():
            yield scrapy.Request(link, callback=self.parse_item, headers=header)

    def parse_item(self, response):
        """Request every image page ``<gallery-url>/1 .. <gallery-url>/<last>``."""
        header = {
            'referer': response.url,
            'user-agent': random.choice(UA)["User-Agent"],
        }
        # Assumes the 5th pagination link carries the last page number --
        # verify against the site markup if galleries start being skipped.
        endpage = response.xpath("//div[@class='pagenavi']/a[5]/span/text()").extract()
        try:
            last_page = int(endpage[0])
        except (IndexError, ValueError):
            # Missing or non-numeric page count: skip this gallery instead of
            # letting the callback raise.
            self.logger.warning("No page count found on %s", response.url)
            return
        for i in range(1, last_page + 1):
            yield scrapy.Request(
                response.url + '/' + str(i),
                headers=header,
                callback=self.parse_img,
            )

    def parse_img(self, response):
        """Build the item for the single image shown on an image page.

        ``name`` and ``imgname`` are the image ``alt`` text and the page title
        with their first 15 characters dropped (kept from the original code).
        """
        imgurl = response.xpath("//div[@class='main-image']/p/a/img/@src").extract()
        name = response.xpath("//div[@class='main-image']/p/a/img/@alt").extract()
        title = response.xpath("//h2[@class='main-title']/text()").extract()
        if not (imgurl and name and title):
            # Layout changed or an error page was served: nothing to download.
            self.logger.warning("No image found on %s", response.url)
            return

        item = MeiziItem()
        item['imgurl'] = imgurl[0]
        item['imgname'] = title[0][15:]
        item['name'] = name[0][15:]
        # NOTE: the original post also sketched a manual download with the
        # requests module (for custom file naming); the item is handed to the
        # item pipelines instead.
        yield item

items.py:

import scrapy


class MeiziItem(scrapy.Item):
    """Item describing a single image; populated by the spider's ``parse_img``."""
    # define the fields for your item here like:
    # Image ``alt`` text with its first 15 characters removed.
    name = scrapy.Field()
    # Image source URL as scraped; the image pipeline's ``item_completed``
    # later overwrites it with the list of stored file paths.
    imgurl=scrapy.Field()
    # Page title (``h2.main-title``) with its first 15 characters removed.
    imgname=scrapy.Field()
pipelines.py:
import random
import re

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

from Meizi.spiders.meizi import UA
class MeiziPipeline(object):
    """Pass-through pipeline stage: every item is forwarded untouched."""

    def process_item(self, item, spider):
        """Return ``item`` exactly as received; ``spider`` is not used."""
        forwarded = item
        return forwarded
class ImagePipeline(ImagesPipeline):
    """Download ``item['imgurl']`` and store it as ``full/<name>/<imgname>``."""

    # Characters removed from names before they become path components.
    _UNSAFE_CHARS = re.compile(r'[?\\*|“<>:/]')

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image.

        The item's ``imgname`` and ``name`` travel in ``meta`` because
        ``file_path`` only receives the request, not the item fields.
        """
        header = {
            'Referer': '{}'.format(item['imgurl']),
            'User-Agent': random.choice(UA)['User-Agent'],
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'upgrade-insecure-requests': '1',
            'Connection': 'keep-alive',
        }
        yield scrapy.Request(
            item['imgurl'],
            headers=header,
            meta={'imgname': item['imgname'], 'name': item['name']},
        )

    def item_completed(self, results, item, info):
        """Replace ``item['imgurl']`` with the stored paths.

        Raises:
            DropItem: when no download in ``results`` succeeded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['imgurl'] = image_paths
        return item

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path ``full/<name>/<imgname>`` for ``request``.

        Both parts come from ``request.meta`` (set in ``get_media_requests``),
        are stripped of surrounding spaces and have unsafe characters removed.
        ``item`` is accepted for compatibility with Scrapy versions that pass
        it as a keyword argument; it is not used.
        """
        imgname = self._clean(request.meta['imgname'])
        name = self._clean(request.meta['name'])
        return u'full/{}/{}'.format(name, imgname)

    @classmethod
    def _clean(cls, text):
        """Strip surrounding spaces and drop characters unsafe in a path."""
        return cls._UNSAFE_CHARS.sub('', text.strip(' '))

settings.py配置:

# --- Project identity and spider discovery ---
BOT_NAME = 'Meizi'
SPIDER_MODULES = ['Meizi.spiders']
NEWSPIDER_MODULE = 'Meizi.spiders'

# --- Crawl behaviour ---
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3

# --- Item pipelines ---
ITEM_PIPELINES = {
    'Meizi.pipelines.MeiziPipeline': 300,
    'Meizi.pipelines.ImagePipeline': 300,  # enable the image pipeline
}

# Directory under which downloaded images are stored.
IMAGES_STORE = '/home/gg/Meizi/Meizi/Images'

java抓取:

```java
package com.atgui.springboot.controller;

import org.apache.commons.collections.map.LinkedMap;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.HttpClientUtils;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import us.codecraft.xsoup.Xsoup;

import javax.net.ssl.SSLContext;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.channels.UnresolvedAddressException;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.*;

public class meizi {
    /**
     * Pool of browser User-Agent values, built once and shared by every call to {@link #UA()}.
     * Values are bare header values (no "User-Agent:" prefix) with balanced parentheses.
     */
    private static final List<String> USER_AGENTS = Collections.unmodifiableList(Arrays.asList(
            "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
            "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
            "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
            "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
            "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
            "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
            "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
            "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
            "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
            "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
            "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1",
            "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
            "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
            "UCWEB7.0.2.37/28/999",
            "NOKIA5700/ UCWEB7.0.2.37/28/999",
            "Openwave/ UCWEB7.0.2.37/28/999",
            "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
    ));

    /** Shared random source used to pick an entry from {@link #USER_AGENTS}. */
    private static final Random UA_RANDOM = new Random();

    /**
     * Picks one User-Agent value at random.
     *
     * <p>The pool is created once instead of being rebuilt on every call, duplicate entries
     * were removed, and malformed values (missing ")", "KHTML; like Gecko", an embedded
     * "User-Agent: " prefix) were corrected.
     *
     * @return a User-Agent header value, never {@code null}
     */
    public static String UA(){
        return USER_AGENTS.get(UA_RANDOM.nextInt(USER_AGENTS.size()));
    }
    /**
     * Builds an HTTP client able to reach HTTPS sites by trusting every certificate chain
     * and skipping host-name verification.
     *
     * <p>SECURITY: this disables TLS server authentication entirely, so connections are open
     * to man-in-the-middle interception. Kept because the crawler relies on it; do not reuse
     * this client for anything sensitive.
     *
     * <p>If the permissive SSL context cannot be built, the failure is printed and a default
     * client (with normal certificate checks) is returned instead.
     *
     * @return a new client; the caller is responsible for closing it
     */
    public static CloseableHttpClient createSSLClientDefault() {
        try {
            SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(null, new TrustStrategy() {
                // Trust every certificate chain.
                @Override
                public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                    return true;
                }
            }).build();
            // NoopHostnameVerifier replaces the deprecated ALLOW_ALL_HOSTNAME_VERIFIER constant.
            SSLConnectionSocketFactory sslsf =
                    new SSLConnectionSocketFactory(sslContext, NoopHostnameVerifier.INSTANCE);

            return HttpClients.custom().setSSLSocketFactory(sslsf).build();
        } catch (KeyManagementException | NoSuchAlgorithmException | KeyStoreException e) {
            e.printStackTrace();
        }
        return HttpClients.createDefault();
    }
    /**
     * Fetches {@code url} with browser-like headers and returns the body decoded as UTF-8.
     *
     * <p>A new client is created per call and both the client and the response are closed
     * before returning. After a successful (HTTP 200) fetch the method pauses for one second
     * to throttle the crawl.
     *
     * @param url absolute URL of the page to download
     * @return the page HTML, or {@code null} when the status is not 200 or any error occurs
     *         (the error is printed, not thrown)
     */
    public static String getHtml(String url) {

        String html=null;
        try (CloseableHttpClient httpClient=createSSLClientDefault()) {

            HttpGet httpGet=new HttpGet(url);

            httpGet.setHeader("accept", "text/html");
            // "br" is intentionally not advertised: HttpClient 4.x decodes gzip/deflate only,
            // so a Brotli-compressed body could not be read.
            httpGet.setHeader("accept-Encoding", "gzip, deflate");
            httpGet.setHeader("accept-Language", "zh-CN,zh;q=0.8");
            httpGet.setHeader("connection", "keep-alive");
            httpGet.setHeader("referer","http://www.mzitu.com/xinggan/");
            // Header value must not carry a leading space.
            httpGet.setHeader("host","www.mzitu.com");
            httpGet.setHeader("user-agent", UA());
            try (CloseableHttpResponse response=httpClient.execute(httpGet)) {
                if (response.getStatusLine().getStatusCode()== HttpStatus.SC_OK){
                    HttpEntity httpEntity=response.getEntity();
                    html= EntityUtils.toString(httpEntity,"utf-8");
                    try {
                        Thread.sleep(1000);
                    } catch (InterruptedException e) {
                        // Keep the fetched page but restore the flag so callers can see the interrupt.
                        Thread.currentThread().interrupt();
                        e.printStackTrace();
                    }
                }
                else {
                    System.out.println("not 200");
                    System.out.println(EntityUtils.toString(response.getEntity()));
                }
            }

        }catch (Exception e){
            // Best effort: any failure is reported and turned into a null result.
            e.printStackTrace();
        }
        return html;

    }
    /**
     * Downloads listing pages {@code url/1 .. url/page} and collects the gallery links
     * found on each of them.
     *
     * @param url  listing base URL, without a trailing slash
     * @param page number of listing pages to read, starting from page 1
     * @return one list of gallery URLs per listing page, or {@code null} as soon as any
     *         listing page cannot be downloaded
     */
    public static List<List<String>> getResponse(String url,int page){
        List<List<String>> pageUrl=new ArrayList<>();
        for (int i = 1; i <= page; i++) {
            String html = getHtml(url + "/" + i);
            if (html == null) {
                return null;
            }
            Document document = Jsoup.parse(html);
            List<String> links = Xsoup.compile("//ul[@id='pins']/li/a/@href").evaluate(document).list();
            System.out.println(links);
            pageUrl.add(links);
        }
        return pageUrl;
    }
    /**
     * Expands every gallery found on the listing pages into the URLs of its image pages
     * ({@code gallery/1 .. gallery/last}).
     *
     * <p>Galleries whose page cannot be downloaded, that have no title, or whose page count
     * is missing, non-numeric or not positive are skipped.
     *
     * @param url  listing base URL, passed to {@link #getResponse(String, int)}
     * @param page number of listing pages to read
     * @return one list of image-page URLs per gallery, or {@code null} when the listing
     *         pages could not be read
     */
    public static List<List<String>>  parse(String url,int page)
    {
        List<List<String>> pageUrl = getResponse(url, page);
        if (pageUrl==null){
            return null;
        }

        List<List<String>> img=new ArrayList<>();
        for (List<String> galleries : pageUrl){
            for (String s : galleries) {

                String html = getHtml(s);
                if (html == null) {
                    continue;
                }
                Document document = Jsoup.parse(html);
                String name = Xsoup.compile("//h2[@class='main-title']/text()").evaluate(document).get();
                // Assumes the 5th pagination link holds the last page number -- verify against the site markup.
                String endpage = Xsoup.compile("//div[@class='pagenavi']/a[5]/span/text()").evaluate(document).get();
                if (name == null) {
                    continue;
                }
                System.out.println("=====>>>" + name);
                System.out.println("=====>>>" + endpage);
                if (endpage == null) {
                    System.err.println("no page count found for " + s);
                    continue;
                }
                int lastPage;
                try {
                    lastPage = Integer.parseInt(endpage.trim());
                } catch (NumberFormatException e) {
                    e.printStackTrace();
                    continue;
                }
                System.out.println(lastPage);
                if (lastPage < 1) {
                    // Nothing to download; avoids adding an empty gallery to the result.
                    continue;
                }
                List<String> imgurl = new ArrayList<>();
                for (int i = 1; i <= lastPage; i++) {
                    imgurl.add(s + "/" + i);
                }
                img.add(imgurl);
            }
        }

        return img;
    }
    /*
        imgurl=response.xpath("//div[@class='main-image']/p/a/img/@src").extract()
        # print(imgurl)
        name=response.xpath("//div[@class='main-image']/p/a/img/@alt").extract()
        imgname=response.xpath("//h2[@class='main-title']/text()").extract()[0][15:]
        dirs='/home/gong/gg/Images/%s'%name[0][15:]
    */
    /**
     * Downloads every image of every gallery reachable from the listing pages and stores
     * it as {@code F:\meizi\<alt text>\<page title>.jpg}.
     *
     * <p>Failures on a single image are printed and the crawl moves on to the next image.
     * If the thread is interrupted the interrupt flag is restored and the crawl stops.
     *
     * @param url  listing base URL, passed to {@link #parse(String, int)}
     * @param page number of listing pages to read
     */
    public static  void parse_img(String url,int page){
        List<List<String>> img = parse(url, page);
        if (img==null){
            System.out.println("网络");
            return;
        }
        // One client serves all image downloads and is closed when the crawl ends.
        try (CloseableHttpClient httpClient = createSSLClientDefault()) {
            for (List<String> list : img) {
                for (String s : list) {

                    String html = getHtml(s);
                    if (html==null){
                        continue;
                    }
                    Document document = Jsoup.parse(html);
                    String name= Xsoup.compile("//div[@class='main-image']/p/a/img/@alt").evaluate(document).get();
                    String imgname=Xsoup.compile("//h2[@class='main-title']/text()").evaluate(document).get();
                    String imgurl=Xsoup.compile("//div[@class='main-image']/p/a/img/@src").evaluate(document).get();
                    System.out.println("=====>>>"+name);
                    System.out.println("=====>>>"+imgurl);
                    System.out.println(imgname);
                    if (name == null || imgname == null || imgurl == null) {
                        // Page without an image: skip instead of writing "null" files.
                        continue;
                    }
                    File dir = new File("F:\\meizi\\" + safeFileName(name));
                    String fileName = safeFileName(imgname);
                    try {
                        if (downloadImage(httpClient, s, imgurl, new File(dir, fileName + ".jpg"))) {
                            System.out.println(dir.getPath()+"\\"+fileName+"下载完成");
                            Thread.sleep(1000);
                        }
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                        e.printStackTrace();
                        return;
                    } catch (Exception e) {
                        e.printStackTrace();
                    }

                }
            }
        } catch (IOException e) {
            // Raised only when closing the client fails.
            e.printStackTrace();
        }
    }

    /** Removes characters that are illegal in Windows file names and trims surrounding whitespace. */
    private static String safeFileName(String raw) {
        return raw.trim().replaceAll("[\\\\/:*?\"<>|]", "");
    }

    /**
     * Fetches one image and writes it to {@code target}, creating the parent directory if needed.
     *
     * @return {@code true} when the image was written, {@code false} when the server did not
     *         answer 200 or sent no body
     */
    private static boolean downloadImage(CloseableHttpClient httpClient, String pageUrl, String imgurl, File target)
            throws IOException {
        HttpGet httpGet=new HttpGet(imgurl);
        System.out.println("url="+pageUrl);
        System.out.println("img="+imgurl);
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
        httpGet.setHeader("Referer",pageUrl);
        httpGet.setHeader("User-Agent", UA());
        // try-with-resources releases the connection on every path, including errors.
        try (CloseableHttpResponse response=httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode()!= HttpStatus.SC_OK){
                System.out.println("not 200");
                System.out.println(EntityUtils.toString(response.getEntity()));
                return false;
            }
            HttpEntity httpEntity=response.getEntity();
            if (httpEntity==null){
                return false;
            }
            File dir = target.getParentFile();
            // mkdirs (not mkdir) so a missing F:\meizi root is created as well.
            if (dir != null && !dir.isDirectory() && !dir.mkdirs()) {
                throw new IOException("cannot create directory " + dir);
            }
            try (InputStream imghtml = httpEntity.getContent()) {
                if (imghtml==null){
                    return false;
                }
                FileUtils.copyToFile(imghtml, target);
            }
            return true;
        }
    }
    /**
     * Entry point: crawls the first listing page and downloads its images.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final String listingUrl = "https://www.mzitu.com/xinggan/page";
        final int listingPages = 1;
        parse_img(listingUrl, listingPages);
    }
}
```

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值