/**
 * Basic helper for scraping fields out of web pages with Jsoup.
 * Created by 耿直 on 2017/11/29.
 */
public class Html {

    /** Timeout, in milliseconds, of the plain POST request. */
    private static final int PLAIN_TIMEOUT_MS = 500000;

    /** Timeout, in milliseconds, of the parameterised fallback POST request. */
    private static final int FALLBACK_TIMEOUT_MS = 300000;

    /**
     * Fetches the page at {@code url} with an HTTP POST and parses it.
     *
     * <p>A plain POST is tried first. Only if it fails with an {@link IOException}
     * is a second POST attempted, this time sending the form field
     * {@code query=Java}, the user agent {@code Mozilla} and the cookie
     * {@code auth=token}. Each failure is reported via {@code printStackTrace()}.
     *
     * @param url address of the page to fetch
     * @return the parsed document, or {@code null} if both attempts failed
     */
    public Document getHtmlTextByUrl(String url) {
        try {
            return Jsoup.connect(url).timeout(PLAIN_TIMEOUT_MS).post();
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Fallback: the plain request failed, so retry with the extra request data.
        try {
            return Jsoup.connect(url)
                    .data("query", "Java")
                    .userAgent("Mozilla")
                    .cookie("auth", "token")
                    .timeout(FALLBACK_TIMEOUT_MS)
                    .post();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Gets the tree nodes (e.g. {@code <tr><td></td></tr>}) matching a selector.
     *
     * @param doc       document to search; must not be {@code null}
     * @param className CSS selector handed to {@code doc.select} unchanged; to match
     *                  a class it must already carry the leading dot
     * @return the elements matched by the selector
     */
    public Elements getElementByClass(Document doc, String className) {
        return doc.select(className);
    }

    /**
     * Fetches {@code url} and collects, for every child of every element with
     * CSS class {@code type}, the text and {@code href} of that child's first
     * child element.
     *
     * <p>Each entry of the returned list is a {@code String[4]}:
     * <ul>
     *   <li>index 0: left {@code null} (never assigned)</li>
     *   <li>index 1: own text of the first child element</li>
     *   <li>index 2: value of its {@code href} attribute, as written in the page</li>
     *   <li>index 3: {@code type}</li>
     * </ul>
     * Children that have no child element of their own are skipped.
     *
     * @param name not used by this method
     * @param url  address of the page to scrape
     * @param type CSS class name without the leading dot (the original note
     *             gives "provincertr" as an example)
     * @return list of {@code String[]} entries; empty if the page could not be
     *         fetched or nothing matched
     */
    public ArrayList getProvince(String name, String url, String type) {
        ArrayList<String[]> result = new ArrayList<>();
        String classSelector = "." + type;
        Document doc = this.getHtmlTextByUrl(url);
        if (doc == null) {
            return result;
        }
        for (Element row : this.getElementByClass(doc, classSelector)) {
            for (Element cell : row.children()) {
                Element firstChild = cell.children().first();
                if (firstChild == null) {
                    continue;
                }
                String[] entry = new String[4];
                entry[1] = firstChild.ownText();
                entry[2] = firstChild.attr("href");
                entry[3] = type;
                result.add(entry);
            }
        }
        return result;
    }
}
// A small Java crawler demo, mainly for scraping information.
// Latest recommended article published on 2022-07-18 20:25:14.