需求:
爬取出所有号段,并找到他们对应的运营商和所属地。
添加依赖:
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.9.2</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
实现代码:
package com.deeplinkJavaSpider.MainPageSpider;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.*;
public class numberRegex {

    /** Destination file for the crawled "number segment -> carrier-province-city" records. */
    public static String outputPath = "C:\\Users\\lenovo\\Desktop\\a1.txt";

    /**
     * Downloads the page at the given URL and returns its body as a single string
     * (line breaks are dropped because lines are appended without separators).
     *
     * @param urlString absolute URL to fetch
     * @return page body decoded as GBK
     * @throws IOException if the connection or read fails
     */
    public static StringBuilder openUrl(String urlString) throws IOException {
        URL url = new URL(urlString);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        StringBuilder result = new StringBuilder();
        // Common request headers; a randomized User-Agent lowers the chance of being blocked.
        connection.setRequestProperty("accept", "*/*");
        connection.setRequestProperty("connection", "Keep-Alive");
        connection.setRequestProperty("user-agent", getUserAgent());
        connection.connect();
        // The target site serves GBK; without an explicit charset the platform default would be used.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "gbk"))) {
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        } finally {
            connection.disconnect(); // release the underlying network resources
        }
        return result;
    }

    /**
     * Picks one User-Agent string at random from src/main/resources/user-Agent.txt
     * (one UA per line).
     *
     * @return a randomly chosen User-Agent line
     * @throws IOException if the file cannot be read or contains no lines
     */
    public static String getUserAgent() throws IOException {
        List<String> agents = new ArrayList<>();
        // try-with-resources closes the reader even if readLine throws.
        try (BufferedReader bufferedReader = new BufferedReader(
                new InputStreamReader(new FileInputStream("src//main//resources//user-Agent.txt")))) {
            String lineTxt;
            while ((lineTxt = bufferedReader.readLine()) != null) {
                agents.add(lineTxt);
            }
        }
        if (agents.isEmpty()) {
            // Random.nextInt(0) would throw an obscure IllegalArgumentException; fail clearly instead.
            throw new IOException("user-Agent.txt contains no User-Agent lines");
        }
        return agents.get(new Random().nextInt(agents.size()));
    }

    /**
     * Fetches the site's main page and hands it to {@link #spiderMainPage(String)}.
     *
     * @param urlString URL of the main page
     * @throws Exception propagated from fetching or parsing
     */
    public static void doFetchPage(String urlString) throws Exception {
        StringBuilder result = openUrl(urlString);
        spiderMainPage(result.toString());
    }

    /**
     * Parses the main page: for every province block (div.fkt inside div.fkce) collects
     * each city link, keyed as "province-city", then crawls every collected link.
     *
     * @param html raw HTML of the main page
     * @throws Exception propagated from fetching or parsing the per-city pages
     */
    public static void spiderMainPage(String html) throws Exception {
        Map<String, String> map = new HashMap<>();
        Document doc = Jsoup.parse(html);
        Elements provinceBlocks = doc.select("div.fkce").select("div.fkt");
        for (Element block : provinceBlocks) {
            String province = block.select("div.fkbj").text();
            // All city links under div.fklk of this province block.
            for (Element cityLink : block.select("div.fklk").select("a")) {
                map.put(province + "-" + cityLink.text(), cityLink.attr("href"));
            }
        }
        doFetchNextPage(map);
    }

    /**
     * Fetches every per-city page and extracts its number segments.
     *
     * @param map key is "province-city", value is the page URL to crawl
     * @throws Exception propagated from fetching or parsing
     */
    public static void doFetchNextPage(Map<String, String> map) throws Exception {
        for (Map.Entry<String, String> entry : map.entrySet()) {
            StringBuilder result = openUrl(entry.getValue());
            spiderDeepLinkPage(result.toString(), entry.getKey());
        }
    }

    /**
     * Parses one city page: pairs each number-segment list (body &gt; div.all &gt; ul)
     * with its carrier header (body &gt; div.all &gt; div &gt; div.num_bg, chars 3-4 of the
     * text are the carrier name) and writes "segment:carrier-province-city" records.
     *
     * @param html     raw HTML of the city page
     * @param province "province-city" label for this page
     * @throws Exception propagated from writing the output file
     */
    public static void spiderDeepLinkPage(String html, String province) throws Exception {
        Map<String, String> map = new HashMap<>();
        List<String> listKey = new ArrayList<>();
        List<String> listValue = new ArrayList<>();
        Document doc = Jsoup.parse(html);
        for (Element element : doc.select("body > div.all>ul")) {
            listKey.add(element.text());
        }
        for (Element element : doc.select("body > div.all >div>div.num_bg ")) {
            String text = element.text();
            // Guard against headers shorter than expected; substring(3, 5) is the carrier name.
            if (text.length() >= 5) {
                listValue.add(text.substring(3, 5) + "-" + province);
            }
        }
        // Zip the two lists into a map; cap at the shorter list so a malformed page
        // cannot cause an IndexOutOfBoundsException.
        int pairs = Math.min(listKey.size(), listValue.size());
        for (int i = 0; i < pairs; i++) {
            map.put(listKey.get(i), listValue.get(i));
        }
        writeInfo(map);
    }

    /**
     * Appends the crawled records to {@link #outputPath}, one "segment:value" per line.
     * Each map key is a whitespace-separated list of number segments; each non-blank
     * segment becomes its own line.
     *
     * @param map key is a string of number segments, value is "carrier-province-city"
     * @throws Exception if the file cannot be written
     */
    public static void writeInfo(Map<String, String> map) throws Exception {
        File outputFile = new File(outputPath);
        // FileWriter(file, true) creates the file if it does not exist, so no exists() branch
        // is needed; try-with-resources closes the writer even if a write fails.
        try (BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(outputFile, true))) {
            for (Map.Entry<String, String> entry : map.entrySet()) {
                for (String s : entry.getKey().split(" ")) {
                    if (StringUtils.isNotBlank(s)) {
                        bufferedWriter.write(s + ":" + entry.getValue() + "\n");
                    }
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        doFetchPage("http://www.51hao.cc/");
    }
}
结果截图:
本代码中建立连接时采用的是 HttpURLConnection connection = (HttpURLConnection) url.openConnection();
除了HttpURLConnection,还可以采用HttpClient。
HttpClient建立连接代码:
/**
 * Executes an HTTP POST against the given URL with a form-encoded body.
 *
 * @param httpClient client used to execute the request
 * @param url        target URL
 * @param params     raw request body (already URL-encoded form data)
 * @param encoding   character encoding of the body
 * @return the raw HttpResponse from the server
 * @throws Exception if the request cannot be built or executed
 */
public static HttpResponse doPostConnect(HttpClient httpClient, String url, String params, String encoding) throws Exception {
    // Build the POST request for the target URL.
    HttpPost post = new HttpPost(url);
    // Wrap the parameter string as a form-encoded entity with the requested charset.
    StringEntity body = new StringEntity(params, encoding);
    body.setContentType("application/x-www-form-urlencoded");
    post.setEntity(body);
    // Execute and hand the server's response straight back to the caller.
    return httpClient.execute(post);
}