前言:
jsoup 是一款 Java 的 HTML 解析器,可直接解析某个 URL 地址或 HTML 文本内容。它提供了一套非常省力的 API,可通过 DOM、CSS 选择器以及类似于 jQuery 的操作方法来取出和操作数据。
通过批量设置代理 IP 发起请求连接,可以模拟爬虫对页面元素进行简单抓取。
1.pom依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.7.3</version>
</dependency>
2.客户端:
/**
* @author DHing
*/
public class JsoupClient {
/** Logger obtained under the name "Jsoup". */
private static final Logger log = Logger.getLogger("Jsoup");
/**
 * Lazily created shared instance; it starts as null and is assigned inside
 * getInstance(). Declared volatile so the assignment is visible across threads.
 */
private static volatile JsoupClient instance = null;
/** Default target URL used by main(), BolgBody(), article() and Blog(). */
static String url="http://www.credit-manage.com/search.htm";
/**
 * Returns the shared {@code JsoupClient}, creating it on first use.
 *
 * <p>The instance is checked once without locking; only when it is still
 * unset does the method synchronize on the class and check again before
 * constructing it, so at most one instance is ever created.
 *
 * @return the single shared client instance
 */
public static JsoupClient getInstance() {
    // Fast path: already initialised, no locking needed.
    if (instance != null) {
        return instance;
    }
    synchronized (JsoupClient.class) {
        // Re-check under the lock; another thread may have won the race.
        if (instance == null) {
            instance = new JsoupClient();
        }
    }
    return instance;
}
/**
 * Sends an HTTP POST request through a randomly chosen proxy and returns the
 * parsed response document.
 *
 * <p>Before connecting, {@link #setProxyIp()} is called to select the proxy.
 * A fixed desktop-Chrome User-Agent string is sent with the request.
 *
 * @param url    the address to post to
 * @param map    form parameters to submit; may be {@code null} for none
 * @param cookie raw value for the {@code Cookie} request header; when
 *               {@code null} or empty no {@code Cookie} header is sent
 * @return the response body parsed as a jsoup {@code Document}
 * @throws IOException if the request fails
 */
public static Document httpPost(String url,Map<String,String> map,String cookie) throws IOException{
    // Choose a proxy before the connection is opened.
    JsoupClient.setProxyIp();
    Connection con = Jsoup.connect(url).userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31");
    // Copy every form parameter onto the request.
    if (map != null) {
        for (Entry<String, String> entry : map.entrySet()) {
            con.data(entry.getKey(), entry.getValue());
        }
    }
    // jsoup rejects a null header value, and an empty Cookie header carries
    // no information, so only attach the header when there is a real value.
    if (cookie != null && !cookie.isEmpty()) {
        con.header("Cookie", cookie);
    }
    return con.post();
}
/**
 * Prints the body element of two documents: one parsed from an inline HTML
 * string, and one fetched with a GET request from {@code url}.
 *
 * @throws IOException if fetching the remote document fails
 */
private static void BolgBody() throws IOException {
    // Case 1: parse HTML supplied directly as a string.
    String inlineHtml = "<html><head><title> 开源中国社区 </title></head>"
            + "<body><p> 这里是 jsoup 项目的相关文章 </p></body></html>";
    Document parsed = Jsoup.parse(inlineHtml);
    System.out.println(parsed.body());

    // Case 2: load the HTML document straight from the URL.
    Document fetched = Jsoup.connect(url).get();
    String bodyMarkup = fetched.body().toString();
    System.out.println(bodyMarkup);
}
/**
 * Fetches {@code url} and, for every element whose {@code class} attribute is
 * {@code postTitle}, prints the {@code href} and trimmed text of each
 * {@code <a>} tag inside it.
 *
 * <p>An {@link IOException} from the fetch is not propagated; its stack trace
 * is printed instead.
 */
public static void article() {
    try {
        Document page = Jsoup.connect(url).get();
        Elements titleBlocks = page.getElementsByAttributeValue("class","postTitle");
        for (Element titleBlock : titleBlocks) {
            for (Element anchor : titleBlock.getElementsByTag("a")) {
                String href = anchor.attr("href");
                String caption = anchor.text().trim();
                System.out.println(href);
                System.out.println(caption);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Fetches {@code url} and prints the inner HTML of every element whose
 * {@code class} attribute is {@code postBody}.
 *
 * <p>An {@link IOException} from the fetch is not propagated; its stack trace
 * is printed instead.
 */
public static void Blog() {
    try {
        Document page = Jsoup.connect(url).get();
        Elements bodyBlocks = page.getElementsByAttributeValue("class","postBody");
        for (Element bodyBlock : bodyBlocks) {
            String innerHtml = bodyBlock.html();
            System.out.println(innerHtml);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Picks a random {@code host:port} entry from the classpath resource
 * {@code /proxyip.txt} and installs it as the HTTP proxy through the
 * {@code http.proxyHost} / {@code http.proxyPort} system properties. It also
 * sets {@code http.maxRedirects} to 50 and {@code proxySet} to {@code true}.
 *
 * <p>Each line of the resource is expected to hold one {@code host:port}
 * pair. Blank lines and lines without both a host and a port are ignored.
 * When the resource is missing, unreadable, or holds no usable entry, a
 * message is printed and the proxy settings are left untouched; the method
 * never retries indefinitely.
 *
 * <p>System properties are global to the JVM, so the chosen proxy applies to
 * all later HTTP connections, not only to the caller's.
 */
public static void setProxyIp() {
    java.io.InputStream resource = JsoupClient.class.getResourceAsStream("/proxyip.txt");
    if (resource == null) {
        // Without the list there is nothing to choose from; retrying cannot help.
        System.out.println("未找到可用的代理ip,跳过代理设置");
        return;
    }

    List<String> ipList = new ArrayList<>();
    // try-with-resources closes the reader (and the underlying stream).
    try (BufferedReader proxyIpReader = new BufferedReader(new InputStreamReader(resource))) {
        String line;
        while ((line = proxyIpReader.readLine()) != null) {
            String candidate = line.trim();
            int separator = candidate.lastIndexOf(':');
            // Keep only entries that have text on both sides of the last ':'.
            if (separator > 0 && separator < candidate.length() - 1) {
                ipList.add(candidate);
            }
        }
    } catch (IOException e) {
        System.out.println("未找到可用的代理ip,跳过代理设置");
        return;
    }

    if (ipList.isEmpty()) {
        // Random.nextInt(0) would throw; bail out instead of looping.
        System.out.println("未找到可用的代理ip,跳过代理设置");
        return;
    }

    Random random = new Random();
    String ipport = ipList.get(random.nextInt(ipList.size()));
    int separator = ipport.lastIndexOf(':');
    String proxyIp = ipport.substring(0, separator);
    String proxyPort = ipport.substring(separator + 1);

    System.setProperty("http.maxRedirects", "50");
    System.getProperties().setProperty("proxySet", "true");
    System.getProperties().setProperty("http.proxyHost", proxyIp);
    System.getProperties().setProperty("http.proxyPort", proxyPort);
    System.out.println("设置代理ip为:" + proxyIp + "端口号为:" + proxyPort);
}
/**
 * Demo entry point: posts one form field to {@code url}, then runs
 * {@code BolgBody()} and {@code Blog()}.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if any of the demo calls fails
 */
public static void main(String[] args) throws Exception {
    Map<String,String> formFields = new HashMap<>();
    formFields.put("condition", "371326199103250818");
    httpPost(url, formFields, "");

    BolgBody();
    Blog();

    // Further usage examples, kept for reference only.
    //
    // POST request with parameters, User-Agent, cookie and timeout:
    //   Document doc = Jsoup.connect("http://www.oschina.net/")
    //       .data("query", "Java")   // request parameter
    //       .userAgent("I ’ m jsoup") // set User-Agent
    //       .cookie("auth", "token") // set cookie
    //       .timeout(3000)           // set connection timeout
    //       .post();                 // access the URL with POST
    //
    // Load an HTML document from a file:
    //   File input = new File("D:/test.html");
    //   Document doc = Jsoup.parse(input,"UTF-8","http://www.oschina.net/");
}
关于 jsoup 提供的更多方法,可以参照文档:http://www.open-open.com/jsoup/