Jsoup 爬取数据测试

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.X509TrustManager;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.ysccc.tools.Emptys;

/**
 * Two-phase scraper built on Jsoup.
 *
 * <p>Phase 1 walks a fixed number of list pages and stores every detail-page link it
 * finds in {@code d:\address.txt}. Phase 2 reads that file back, fetches each link and
 * stores the text of the selected field spans, tab-separated with one line per page, in
 * {@code d:\merchant.txt}.
 */
public class DataBugger {

	/** Scheme and host that are prepended to every href collected from a list page. */
	private static final String SITE_ROOT = "https://waimao.mingluji.com";

	/** List-page URL up to, but not including, the page index. */
	private static final String LIST_PAGE_URL = SITE_ROOT + "/%E5%9B%9B%E5%B7%9D?page=";

	/** Number of list pages to walk; indices run from 0 to this value minus one. */
	private static final int LIST_PAGE_COUNT = 16;

	/** Pause before every HTTP request, in milliseconds. */
	private static final long REQUEST_DELAY_MS = 1000;

	/** Output of phase 1 and input of phase 2. */
	private static final File ADDRESS_FILE = new File("d:\\address.txt");

	/** Output of phase 2. */
	private static final File MERCHANT_FILE = new File("d:\\merchant.txt");

	/** Charset used for both files. */
	private static final Charset UTF8 = Charset.forName("utf-8");

	/**
	 * Installs JVM-wide {@link HttpsURLConnection} defaults that accept any host name and
	 * any certificate chain.
	 *
	 * <p><b>Warning:</b> this turns off TLS server authentication for every
	 * {@code HttpsURLConnection} created afterwards in this JVM. It is only suitable for
	 * throwaway tooling.
	 *
	 * <p>The call is best-effort: a failure is reported on {@code System.err} and not
	 * rethrown, so callers carry on with whatever defaults are in effect at that point.
	 */
	public static void trustEveryone() {
		try {
			HttpsURLConnection.setDefaultHostnameVerifier(new HostnameVerifier() {
				@Override
				public boolean verify(String hostname, SSLSession session) {
					return true;
				}
			});

			SSLContext context = SSLContext.getInstance("TLS");
			context.init(null, new X509TrustManager[] { new X509TrustManager() {
				@Override
				public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
					// Deliberately empty: every client chain is accepted.
				}

				@Override
				public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
					// Deliberately empty: every server chain is accepted.
				}

				@Override
				public X509Certificate[] getAcceptedIssuers() {
					return new X509Certificate[0];
				}
			} }, new SecureRandom());
			HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory());
		} catch (Exception e) {
			// Stay best-effort, but say so: a silent failure here would only surface later
			// as an unrelated-looking TLS handshake error.
			System.err.println("trustEveryone: could not install permissive TLS defaults: " + e);
		}
	}

	/**
	 * Runs both scraping phases in order.
	 *
	 * @param args ignored
	 * @throws IOException          if a page cannot be fetched or a file cannot be read or written
	 * @throws InterruptedException if the thread is interrupted during a pause between requests
	 */
	public static void main(String[] args) throws IOException, InterruptedException {
		trustEveryone();
		Map<String, String> headers = browserHeaders();
		collectDetailLinks(headers);
		scrapeMerchants(headers);
	}

	/** Builds the browser-like request headers sent with every request. */
	private static Map<String, String> browserHeaders() {
		Map<String, String> headers = new LinkedHashMap<>();
		headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
		// No Accept-Encoding override: the previous value also advertised "br".
		// NOTE(review): jsoup is not known to decode brotli bodies, so its own default is
		// left in place - confirm against the jsoup version in use.
		headers.put("Accept-Language", "zh-CN,zh;q=0.9");
		headers.put("Cache-Control", "max-age=0");
		headers.put("Connection", "keep-alive");
		// No Host override: the previous hard-coded value named a different site than the
		// one being requested.
		headers.put("Upgrade-Insecure-Requests", "1");
		headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36");
		return headers;
	}

	/** Phase 1: walks the list pages and writes one detail-page URL per line to the address file. */
	private static void collectDetailLinks(Map<String, String> headers) throws IOException, InterruptedException {
		StringBuilder links = new StringBuilder();
		for (int page = 0; page < LIST_PAGE_COUNT; page++) {
			System.out.println("当前页码" + page);
			Thread.sleep(REQUEST_DELAY_MS);
			Document doc = Jsoup.connect(LIST_PAGE_URL + page).headers(headers).get();
			for (Element anchor : doc.select("div.view-content span.field-content a")) {
				links.append(SITE_ROOT).append(anchor.attr("href")).append("\r\n");
			}
			// Rewritten after every page so the file always holds the links gathered so
			// far, even if a later page fails.
			FileUtils.write(ADDRESS_FILE, links.toString(), UTF8);
		}
	}

	/** Phase 2: fetches every URL in the address file and writes one tab-separated row per page. */
	private static void scrapeMerchants(Map<String, String> headers) throws IOException, InterruptedException {
		List<String> urls = FileUtils.readLines(ADDRESS_FILE, UTF8);
		StringBuilder rows = new StringBuilder();
		int total = urls.size();
		int index = 0;
		try {
			for (String url : urls) {
				System.out.println("当前企业:" + (index++) + ",企业总数:" + total);
				if (!(Emptys.isNotEmpty(url) && StringUtils.contains(url, "https://"))) {
					// Nothing is requested for this line, so there is nothing to throttle.
					continue;
				}
				Thread.sleep(REQUEST_DELAY_MS);
				Document doc = Jsoup.connect(url).headers(headers).get();
				for (Element field : doc.select("div.content fieldset span.field-item span")) {
					rows.append(field.text()).append("\t");
				}
				rows.append("\r\n");
			}
		} finally {
			// Written on the failure path too, so rows scraped before a failed request
			// are kept; the exception itself still propagates to the caller.
			FileUtils.write(MERCHANT_FILE, rows.toString(), UTF8);
		}
	}
}

转载于:https://my.oschina.net/u/1261213/blog/2243441

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值