java多线程爬论文

学习笔记仅供参考

1. xiancheng.java

package com.example.util;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.net.*;

import java.util.ArrayList;
import java.util.Random;

public class xiancheng implements Runnable {

    // Proxy list shared by every instance; it is static, so the two-argument
    // constructor below replaces it for all instances at once.
    static ArrayList<String> ips = new ArrayList<String>();
    // Papers collected by run() on this instance.
    public ArrayList<LW> lws = new ArrayList<>();
    // Shared helper instance; run() uses it only to call getip().
    static xiancheng xian = new xiancheng();
    // Not referenced anywhere in the visible code.
    static String name="";
    // Page index placed in the "pages" query parameter by run(); run() increments it afterwards.
    int q = 0;
    /** Creates an instance that starts at page 0. */
    public xiancheng(){

    }
    /**
     * Creates a crawler for one result page.
     *
     * @param ipss  proxy list; stored in the static {@code ips} field, so it is shared by all instances
     * @param pages page index to request in run()
     */
    public xiancheng(ArrayList<String> ipss, int pages) {
        ips = ipss;
        q = pages;
    }

    /**
     * Returns the live list of papers collected by this instance (not a copy).
     *
     * @return the list that run() appends to
     */
    public ArrayList<LW> getlws() {
        return this.lws;
    }
     /**
      * Asks the proxy API (via {@code pachong.ips1()}) for a proxy address and returns the first one.
      *
      * <p>Before the request, the JVM-wide HTTP proxy is pointed at a fixed bootstrap proxy.
      * The property key must be spelled {@code http.proxyHost}; a misspelled key is silently
      * ignored by the JDK and would leave only the port changed.
      *
      * @return the first address returned by the API, as received (callers split it on ':')
      * @throws InterruptedException never thrown by this implementation; kept for callers that catch it
      * @throws IllegalStateException if the API returned no address at all
      */
     public String getip() throws InterruptedException {
         System.setProperty("http.proxyHost", "113.140.84.97");
         System.setProperty("http.proxyPort", "80");
         ArrayList<String> candidates = new pachong().ips1();
         if (candidates == null || candidates.isEmpty()) {
             throw new IllegalStateException("proxy API returned no address");
         }
         String ip = candidates.get(0);
         System.out.println("IP是:" + ip);
         return ip;
     }
    /**
     * Crawls one page of search results and the detail page of every paper listed on it.
     *
     * <p>Page {@code q} of the listing is fetched first. For each result row (at most 50) the
     * listing columns are read, the paper's detail page is fetched, and an {@code LW} is built and
     * appended to both {@link #lws} and the shared {@code LW.lws}. Every tenth paper a new proxy is
     * requested and installed through the JVM-wide {@code http.proxyHost}/{@code http.proxyPort}
     * system properties, so it affects every thread in the process.
     *
     * <p>The method stops early (keeping what was collected so far) when the listing or a detail
     * page cannot be fetched. No cookies are sent with the requests.
     */
    @Override
    public void run() {
        final String path;
        try {
            path = "http://qikan.chaoxing.com/searchjour?sw=" + URLEncoder.encode("西北农林科技大学", "utf-8")
                    + "&stryear=17&nosim=1&size=50&x=0_900&pages=" + String.valueOf(q);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return;
        }
        q++;

        String[] ua = {"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586",
                "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
                "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
                "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
                "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.3 Safari/537.36",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.277.400 QQBrowser/9.4.7658.400",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 UBrowser/5.6.12150.8 Safari/537.36",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 TheWorld 7",
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0"};
        // One user agent is chosen per run; every entry of the table is eligible.
        String userAgent = ua[new Random().nextInt(ua.length)];

        Document doc;
        try {
            doc = fetch(path, userAgent);
        } catch (IOException e) {
            System.err.println("Failed to load result page " + path + ": " + e);
            return;
        }

        Elements rows = doc.select("#liebiaoDivId tr");
        // Row 0 is never read (presumably the table header), so result i lives in row i + 1.
        // Short pages are handled by bounding the loop with the number of rows actually present.
        int resultCount = Math.min(50, rows.size() - 1);

        for (int i = 0; i < resultCount; i++) {
            Element row = rows.get(i + 1);
            String detailUrl = "http://qikan.chaoxing.com" + row.child(1).child(0).attr("href").trim();
            String quoteNum = row.child(6).child(0).text();
            String downloadNum = row.child(7).child(0).text();
            String periodical = row.child(3).child(0).text();
            String publishTime = parsePublishTime(row.child(5).child(0).text());

            if (i % 10 == 0) {
                rotateProxy();
            }

            Document detail;
            try {
                detail = fetch(detailUrl, userAgent);
            } catch (IOException e) {
                System.err.println("Failed to load detail page " + detailUrl + ": " + e);
                return;
            }
            System.out.println((i + 1) + ": " + detailUrl);

            String[] keys = new String[8];
            String fund = null;
            String unit = null;
            String summary = null;
            for (Element field : detail.select(".Fmian1 table tbody tr")) {
                // Each field row is read as a label cell followed by a value cell; skip anything else.
                if (field.children().size() < 2) {
                    continue;
                }
                String label = field.child(0).text();
                Element value = field.child(1);
                if (label.equals("【关 键 词】")) {
                    Elements words = value.children();
                    for (int z = 0; z < words.size() && z < keys.length; z++) {
                        keys[z] = words.get(z).text();
                    }
                } else if (label.equals("【基    金】")) {
                    fund = value.text();
                } else if (label.equals("【作者机构】")) {
                    if (value.children().size() == 0) {
                        unit = "西北农林科技大学黄土高原土壤侵蚀与旱地农业国家重点实验室";
                    } else {
                        unit = value.child(0).text();
                    }
                } else if (label.equals("【摘    要】")) {
                    summary = value.text();
                }
            }

            LW lw = new LW();
            lw.setSubject(row.child(1).text());
            lw.setAuthor(row.child(2).text());
            lw.setType("cssci");
            lw.setQuote_num(orEmpty(quoteNum));
            lw.setDownload_num(orEmpty(downloadNum));
            lw.setPeriodical(periodical);
            lw.setPublish_time(publishTime);
            lw.setKey(orEmpty(keys[0]));
            lw.setKey1(orEmpty(keys[1]));
            lw.setKey2(orEmpty(keys[2]));
            lw.setKey3(orEmpty(keys[3]));
            lw.setKey4(orEmpty(keys[4]));
            lw.setKey5(orEmpty(keys[5]));
            lw.setKey6(orEmpty(keys[6]));
            lw.setKey7(orEmpty(keys[7]));
            lw.setFund(orEmpty(fund));
            lw.setUnit(orEmpty(unit));
            // The abstract is passed through as-is and stays null when the page has no abstract row.
            lw.setAbstract1(summary);

            lws.add(lw);
            // NOTE(review): LW.lws is shared static state; whether it is safe for concurrent
            // appends depends on how LW declares it - confirm before running on several threads.
            LW.lws.add(lw);
        }
    }

    /** Downloads and parses one page with the crawler's fixed timeout and the given user agent. */
    private static Document fetch(String url, String userAgent) throws IOException {
        return Jsoup.connect(url)
                .timeout(10000)
                .ignoreHttpErrors(true)
                .userAgent(userAgent)
                .get();
    }

    /**
     * Requests a new proxy and installs it JVM-wide; an unusable answer leaves the current
     * proxy settings untouched instead of aborting the crawl.
     */
    private static void rotateProxy() {
        String ip = null;
        try {
            ip = xian.getip();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
        String[] parts = (ip == null) ? new String[0] : ip.trim().split(":");
        if (parts.length != 2) {
            System.err.println("Ignoring malformed proxy address: " + ip);
            return;
        }
        System.out.println(ip);
        System.setProperty("http.proxyHost", parts[0]);
        System.setProperty("http.proxyPort", parts[1]);
        System.err.println(parts[0] + ":" + parts[1]);
    }

    /**
     * Converts the listing's year/issue text into the stored publish-time string.
     *
     * <p>The first four characters are taken as the year and the number starting at index 6 as
     * the issue (assumes text shaped like "2017年第5期" - TODO confirm against the live page).
     * An 8-character text yields {@code year-0d}; a longer text with two digits at index 6-7
     * yields {@code year-(nn / 2)}; anything else yields {@code "1"}.
     * Digits are tested as characters: converting a {@code char} with {@code Integer.valueOf}
     * gives its code point, not its numeric value.
     */
    private static String parsePublishTime(String issue) {
        if (issue.length() == 8 && isAsciiDigit(issue.charAt(6))) {
            return issue.substring(0, 4) + "-0" + (issue.charAt(6) - '0');
        }
        if (issue.length() > 8 && isAsciiDigit(issue.charAt(6)) && isAsciiDigit(issue.charAt(7))) {
            return issue.substring(0, 4) + "-" + (Integer.parseInt(issue.substring(6, 8)) / 2);
        }
        return "1";
    }

    /** Returns true for the characters '0' through '9'. */
    private static boolean isAsciiDigit(char c) {
        return c >= '0' && c <= '9';
    }

    /** Maps a missing value to the empty string. */
    private static String orEmpty(String value) {
        return value != null ? value : "";
    }
    }

2. pachong.java

package com.example.util;


import com.example.mapper.bookmapper;
import com.example.service.bookservice;
import com.example.service.serviceimpl.bookserviceimpl;
import org.apache.xmlbeans.impl.xb.xsdschema.Public;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.context.ApplicationContext;
import org.springframework.stereotype.Component;
import org.springframework.stereotype.Controller;
import org.springframework.stereotype.Service;
import org.springframework.test.context.junit4.SpringRunner;

import javax.annotation.PostConstruct;
import javax.annotation.Resource;
import java.io.*;
import java.net.*;
import java.security.PublicKey;
import java.util.ArrayList;
import java.util.Random;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
@Service
public class pachong{

    // Mapper used by xiancheng() to look up, count and insert papers. It is filled in by
    // Spring injection, so it is only set on instances that Spring itself creates.
    @Autowired
    bookmapper bm;

    /**
     * Placeholder entry point; the body is empty, so running it does nothing.
     *
     * @param args ignored
     */
    public static void main(String[] args) throws IOException, InterruptedException {


    }

    /**
     * Crawls result page {@code q} of the paper search and the detail page of every paper on it.
     *
     * <p>For each result row (at most 50) the listing columns are read, the detail page is
     * fetched and an {@code LW} is built. Every tenth paper the next address from the proxy API
     * is installed through the JVM-wide {@code http.proxyHost}/{@code http.proxyPort} system
     * properties; a new batch is requested once the current one is used up.
     * No cookies are sent with the requests.
     *
     * @param q page index placed in the {@code pages} query parameter
     * @return the papers collected; empty when the listing cannot be fetched, and only the papers
     *         gathered so far when a detail page cannot be fetched
     */
    public ArrayList<LW> palunwen(Integer q){
        ArrayList<LW> lws = new ArrayList<>();
        final String path;
        try {
            path = "http://qikan.chaoxing.com/searchjour?sw=" + URLEncoder.encode("西北农林科技大学", "utf-8")
                    + "&stryear=13&nosim=1&size=50&x=0_900&pages=" + String.valueOf(q);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return lws;
        }

        String[] ua = {"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586",
                "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
                "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
                "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
                "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.3 Safari/537.36",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.277.400 QQBrowser/9.4.7658.400",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 UBrowser/5.6.12150.8 Safari/537.36",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 TheWorld 7",
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0"};
        // One user agent is chosen per call; every entry of the table is eligible.
        String userAgent = ua[new Random().nextInt(ua.length)];

        Document doc;
        try {
            doc = fetchPage(path, userAgent);
        } catch (IOException e) {
            System.err.println("Failed to load result page " + path + ": " + e);
            return lws;
        }

        Elements rows = doc.select("#liebiaoDivId tr");
        // Row 0 is never read (presumably the table header), so result i lives in row i + 1.
        // Short pages are handled by bounding the loop with the number of rows actually present.
        int resultCount = Math.min(50, rows.size() - 1);

        ArrayList<String> proxies = ips1();
        int nextProxy = 0;
        for (int i = 0; i < resultCount; i++) {
            Element row = rows.get(i + 1);
            String detailUrl = "http://qikan.chaoxing.com" + row.child(1).child(0).attr("href").trim();
            String quoteNum = row.child(6).child(0).text();
            String downloadNum = row.child(7).child(0).text();
            String periodical = row.child(3).child(0).text();
            String publishTime = publishTimeOf(row.child(5).child(0).text());

            if (i % 10 == 0) {
                // Refill once the batch is exhausted so the index can never run past the list.
                if (nextProxy >= proxies.size()) {
                    proxies = ips1();
                    nextProxy = 0;
                }
                if (nextProxy < proxies.size()) {
                    applyProxy(proxies.get(nextProxy));
                    nextProxy++;
                }
            }

            Document detail;
            try {
                detail = fetchPage(detailUrl, userAgent);
            } catch (IOException e) {
                System.err.println("Failed to load detail page " + detailUrl + ": " + e);
                return lws;
            }
            System.out.println((i + 1) + ": " + detailUrl);

            String[] keys = new String[8];
            String fund = null;
            String unit = null;
            String summary = null;
            for (Element field : detail.select(".Fmian1 table tbody tr")) {
                // Each field row is read as a label cell followed by a value cell; skip anything else.
                if (field.children().size() < 2) {
                    continue;
                }
                String label = field.child(0).text();
                Element value = field.child(1);
                if (label.equals("【关 键 词】")) {
                    Elements words = value.children();
                    for (int z = 0; z < words.size() && z < keys.length; z++) {
                        keys[z] = words.get(z).text();
                    }
                } else if (label.equals("【基    金】")) {
                    fund = value.text();
                } else if (label.equals("【作者机构】")) {
                    if (value.children().size() == 0) {
                        unit = "西北农林科技大学黄土高原土壤侵蚀与旱地农业国家重点实验室";
                    } else {
                        unit = value.child(0).text();
                    }
                } else if (label.equals("【摘    要】")) {
                    summary = value.text();
                }
            }

            LW lw = new LW();
            lw.setSubject(row.child(1).text());
            lw.setAuthor(row.child(2).text());
            lw.setType("cssci");
            lw.setQuote_num(emptyIfNull(quoteNum));
            lw.setDownload_num(emptyIfNull(downloadNum));
            lw.setPeriodical(periodical);
            lw.setPublish_time(publishTime);
            lw.setKey(emptyIfNull(keys[0]));
            lw.setKey1(emptyIfNull(keys[1]));
            lw.setKey2(emptyIfNull(keys[2]));
            lw.setKey3(emptyIfNull(keys[3]));
            lw.setKey4(emptyIfNull(keys[4]));
            lw.setKey5(emptyIfNull(keys[5]));
            lw.setKey6(emptyIfNull(keys[6]));
            lw.setKey7(emptyIfNull(keys[7]));
            lw.setFund(emptyIfNull(fund));
            lw.setUnit(emptyIfNull(unit));
            // The abstract is passed through as-is and stays null when the page has no abstract row.
            lw.setAbstract1(summary);

            lws.add(lw);
        }
        return lws;
    }

    /** Downloads and parses one page with the crawler's fixed timeout and the given user agent. */
    private static Document fetchPage(String url, String userAgent) throws IOException {
        return Jsoup.connect(url)
                .timeout(10000)
                .ignoreHttpErrors(true)
                .userAgent(userAgent)
                .get();
    }

    /**
     * Installs a {@code host:port} address as the JVM-wide HTTP proxy; an address that is not in
     * that form is reported and ignored, leaving the current proxy settings untouched.
     */
    private static void applyProxy(String ip) {
        String[] parts = (ip == null) ? new String[0] : ip.trim().split(":");
        if (parts.length != 2) {
            System.err.println("Ignoring malformed proxy address: " + ip);
            return;
        }
        System.out.println(ip);
        System.setProperty("http.proxyHost", parts[0]);
        System.setProperty("http.proxyPort", parts[1]);
        System.err.println(parts[0] + ":" + parts[1]);
    }

    /**
     * Converts the listing's year/issue text into the stored publish-time string.
     *
     * <p>The first four characters are taken as the year and the number starting at index 6 as
     * the issue (assumes text shaped like "2017年第5期" - TODO confirm against the live page).
     * An 8-character text yields {@code year-0d}; a longer text with two digits at index 6-7
     * yields {@code year-(nn / 2)}; anything else yields {@code "1"}.
     * Digits are tested as characters: converting a {@code char} with {@code Integer.valueOf}
     * gives its code point, not its numeric value.
     */
    private static String publishTimeOf(String issue) {
        if (issue.length() == 8 && isDigitChar(issue.charAt(6))) {
            return issue.substring(0, 4) + "-0" + (issue.charAt(6) - '0');
        }
        if (issue.length() > 8 && isDigitChar(issue.charAt(6)) && isDigitChar(issue.charAt(7))) {
            return issue.substring(0, 4) + "-" + (Integer.parseInt(issue.substring(6, 8)) / 2);
        }
        return "1";
    }

    /** Returns true for the characters '0' through '9'. */
    private static boolean isDigitChar(char c) {
        return c >= '0' && c <= '9';
    }

    /** Maps a missing value to the empty string. */
    private static String emptyIfNull(String value) {
        return value != null ? value : "";
    }
    /**
     * Downloads a batch of proxy addresses from the xiequ API and returns them one per line.
     *
     * <p>Each line of the response is echoed to standard output and stored unchanged.
     * The reader is closed and the connection released even when reading fails.
     *
     * @return the response lines in order; empty if the response body is empty
     * @throws IOException if the request cannot be made or the response cannot be read,
     *                     including connect/read timeouts (5 seconds each)
     */
    public ArrayList<String> ips() throws IOException {
        // Address of the page whose content is to be fetched.
        String path = "http://api.xiequ.cn/VAD/GetIp.aspx?act=get&num=100&time=30&plat=1&re=1&type=0&so=1&ow=1&spl=1&addr=&db=1";

        HttpURLConnection conn = (HttpURLConnection) new URL(path).openConnection();
        conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        conn.setRequestProperty("contentType", "GBK");
        conn.setConnectTimeout(5 * 1000);
        // Without a read timeout a stalled server would block the caller indefinitely.
        conn.setReadTimeout(5 * 1000);
        conn.setRequestMethod("GET");

        ArrayList<String> ipp = new ArrayList<String>();
        try (BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream(), "GBK"))) {
            String line;
            while ((line = in.readLine()) != null) {
                ipp.add(line);
                System.out.println(line);
            }
        } finally {
            conn.disconnect();
        }
        return ipp;
    }

    /**
     * Requests proxy addresses from the proxy API and returns the non-blank response lines.
     *
     * <p>This method never throws for I/O problems: any failure, and a response without a usable
     * line, yields a one-element list holding a placeholder address, so callers can always read
     * element 0. Each returned line is also echoed to standard output.
     * The reader is closed and the connection released on every path.
     *
     * @return the addresses as received, or a single placeholder address on failure
     */
    public ArrayList<String> ips1(){
        // NOTE(review): the API key is embedded in this URL; consider moving it to configuration.
        String path = "http://119.45.8.232/Api/?k=NRAX9VIN451I35QJP36SEM&num=1&type=1&f=1&repeat=1&respone=0&ptn=1";

        URL url;
        try {
            url = new URL(path);
        } catch (MalformedURLException e) {
            return singleProxy("1.1.1.1:80");
        }

        HttpURLConnection conn;
        try {
            conn = (HttpURLConnection) url.openConnection();
        } catch (IOException e) {
            return singleProxy("58.218.200.248:22069");
        }
        conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        conn.setRequestProperty("contentType", "utf-8");
        conn.setConnectTimeout(5 * 1000);
        // Without a read timeout a stalled server would block the crawler indefinitely.
        conn.setReadTimeout(5 * 1000);

        ArrayList<String> ipp = new ArrayList<String>();
        try {
            conn.setRequestMethod("GET");
            try (BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream(), "utf-8"))) {
                String line;
                while ((line = in.readLine()) != null) {
                    // Blank lines cannot be split into host and port, so they are dropped here.
                    if (line.trim().isEmpty()) {
                        continue;
                    }
                    ipp.add(line);
                    System.out.println(line);
                }
            }
        } catch (IOException e) {
            // Covers protocol, encoding, connect and read failures alike.
            return singleProxy("1.1.1.1:80");
        } finally {
            conn.disconnect();
        }
        if (ipp.isEmpty()) {
            return singleProxy("1.1.1.1:80");
        }
        return ipp;
    }

    /** Builds the one-element fallback list returned when the proxy API cannot be used. */
    private static ArrayList<String> singleProxy(String address) {
        ArrayList<String> s = new ArrayList<>();
        s.add(address);
        return s;
    }

    /**
     * Crawls result pages 0 to 1499 one after another and stores every paper not yet in the database.
     *
     * <p>Each page is processed by calling {@code run()} directly on the current thread, so the
     * page's results are complete before they are read. A paper whose subject is already known to
     * the mapper is skipped; a new paper gets the id {@code count + 1} (or {@code "1"} when the
     * mapper reports no count) and is then inserted and printed.
     *
     * @throws IOException          if the proxy list cannot be downloaded
     * @throws InterruptedException declared for callers; not thrown by the visible code
     */
    public void xiancheng() throws IOException, InterruptedException {
        ArrayList<String> ips = ips();
        for (int i = 0; i < 1500; i++) {
            xiancheng xian = new xiancheng(ips, i);
            xian.run();
            for (LW lw : xian.lws) {
                // Insert only papers that are not stored yet; otherwise duplicates without an id
                // would be written.
                if (bm.searchlw(lw.getSubject())) {
                    continue;
                }
                if (bm.lwcount() == null) {
                    lw.setId(String.valueOf("1"));
                } else {
                    lw.setId(String.valueOf(Integer.valueOf(bm.lwcount()) + 1));
                }
                bm.insertlw(lw);
                System.out.println(lw.toString());
            }
        }
    }
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值