package com.test.demo.net;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.net.URLEncoder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scrapes the club.pchome.net forum listing for posts matching a keyword and
 * fetches the text of each matching post.
 *
 * <p>All methods are static. Network and parsing failures are reported with
 * {@code printStackTrace()} and never propagate to the caller.
 */
public class KeyWordsSearchUtil {

    /** Scheme and host prepended to the relative links found in the listing. */
    private static final String BASE_URL = "http://club.pchome.net";

    /** Listing URL up to (not including) the URL-encoded keyword. */
    private static final String SEARCH_URL_PREFIX = BASE_URL
            + "/forum_1_15____md__1_";

    /** Listing URL part that follows the URL-encoded keyword. */
    private static final String SEARCH_URL_SUFFIX = ".html";

    /** Timeout, in milliseconds, passed to every jsoup connection. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Marker text that is removed from post contents before returning them. */
    private static final String IMAGE_TOOLBAR_TEXT = "[向左转] [向右转] [原图]";

    /**
     * Value returned by {@link #getContentsByUrl(String)} when no content
     * could be extracted. Kept identical to the historical placeholder so
     * callers comparing against it keep working.
     */
    private static final String DEFAULT_CONTENTS = "11";

    /**
     * Looks up forum posts for a keyword and returns one map per post.
     *
     * <p>Each map holds the keys {@code postsPopularity}, {@code postsTitle},
     * {@code postsFloor}, {@code postsCname}, {@code postsCtime},
     * {@code postsUrl} and {@code postsContents}. A cell that is missing from
     * a listing row yields an empty string instead of aborting the whole
     * search. Fetching {@code postsContents} issues one extra request per
     * post.
     *
     * @param KeyWord
     *            the keyword to search for; {@code null} yields an empty list
     * @return the posts found, possibly empty, never {@code null}
     */
    public static List<Map<String, Object>> findByKeyWord(String KeyWord) {
        List<Map<String, Object>> postsList = new ArrayList<Map<String, Object>>();
        if (KeyWord == null) {
            return postsList;
        }
        try {
            // NOTE(review): the "query"/"auth" values look like sample
            // request data; confirm with the site before removing them.
            Document doc = Jsoup.connect(buildSearchUrl(KeyWord))
                    .data("query", "Java")
                    .userAgent("Mozilla")
                    .cookie("auth", "token")
                    .timeout(TIMEOUT_MILLIS)
                    .ignoreContentType(true)
                    .post();
            // Rows carrying the "h-bg" class are excluded from the result.
            Elements postsLs = doc.select("li.i2").not(".h-bg");
            for (Element childPost : postsLs) {
                postsList.add(toPostMap(childPost));
            }
        } catch (Exception e) {
            // Best effort: report the failure and return what was collected.
            e.printStackTrace();
        }
        return postsList;
    }

    /**
     * Fetches a post page and returns its text content.
     *
     * <p>The text is taken from the first {@code div} nested inside
     * {@code div.mc}, with the image toolbar marker removed.
     *
     * @param url
     *            the absolute URL of the post
     * @return the post text, or the placeholder {@code "11"} when the page
     *         could not be fetched or holds no content element
     */
    public static String getContentsByUrl(String url) {
        String contents = DEFAULT_CONTENTS;
        try {
            Document doc = Jsoup.connect(url)
                    .data("query", "Java")
                    .userAgent("Mozilla")
                    .cookie("auth", "token")
                    .timeout(TIMEOUT_MILLIS)
                    .ignoreHttpErrors(true)
                    .post();
            // A match here implies "div.mc" exists, so one lookup suffices.
            Element contentsEle = doc.select("div.mc div").first();
            if (contentsEle != null) {
                contents = contentsEle.text().replace(IMAGE_TOOLBAR_TEXT, "");
            }
        } catch (Exception e) {
            // Best effort: report the failure and return the placeholder.
            e.printStackTrace();
        }
        return contents;
    }

    /**
     * Runs a sample search for "音乐" and prints the search URL, the number of
     * posts found and every key/value pair of every post.
     *
     * @param args
     *            ignored
     * @throws Exception
     *             if the keyword cannot be URL-encoded
     */
    public static void main(String[] args) throws Exception {
        List<Map<String, Object>> postsList = KeyWordsSearchUtil
                .findByKeyWord("音乐");
        System.out.println(buildSearchUrl("音乐"));
        System.out.println(postsList.size() + "/");
        for (Map<String, Object> post : postsList) {
            for (Map.Entry<String, Object> entry : post.entrySet()) {
                System.out.println("key=" + entry.getKey() + "| value="
                        + entry.getValue());
            }
            System.out.println("-------------------------");
        }
    }

    /** Builds the listing URL for a keyword, URL-encoding it as UTF-8. */
    private static String buildSearchUrl(String keyword)
            throws java.io.UnsupportedEncodingException {
        return SEARCH_URL_PREFIX + URLEncoder.encode(keyword, "utf-8")
                + SEARCH_URL_SUFFIX;
    }

    /** Converts one listing row into the map returned by findByKeyWord. */
    private static Map<String, Object> toPostMap(Element childPost) {
        // Built once and reused for both the URL entry and the content fetch.
        String postUrl = BASE_URL + childPost.select("span.n3 a").attr("href");

        Map<String, Object> postsOneMap = new HashMap<String, Object>();
        postsOneMap.put("postsPopularity", firstText(childPost, "li > span.n2"));
        postsOneMap.put("postsTitle", childPost.select("span.n3 > a").attr(
                "title"));
        postsOneMap.put("postsFloor", firstText(childPost, "span.n4"));
        postsOneMap.put("postsCname", firstText(childPost, "a.bind_hover_card"));
        postsOneMap.put("postsCtime", firstText(childPost, "li > span.n6"));
        postsOneMap.put("postsUrl", postUrl);
        postsOneMap.put("postsContents", getContentsByUrl(postUrl));
        return postsOneMap;
    }

    /** Returns the text of the first match of cssQuery, or "" if none. */
    private static String firstText(Element parent, String cssQuery) {
        Element match = parent.select(cssQuery).first();
        return match == null ? "" : match.text();
    }
}
// Reposted from: https://my.oschina.net/u/2526634/blog/546748