Jsoup爬虫 demo

pom.xml文件添加下面的内容

<!-- Maven build descriptor for the jsoup crawler demo. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>webContent</groupId>
    <artifactId>com.xly.webContent</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <!-- Extra repositories. The URLs must not contain whitespace after the scheme,
         otherwise Maven cannot resolve them. -->
    <repositories>
        <repository>
            <id>com.springsource.repository.bundles.release</id>
            <name>EBR Spring Release Repository</name>
            <url>http://repository.springsource.com/maven/bundles/release</url>
        </repository>
        <repository>
            <id>com.springsource.repository.bundles.external</id>
            <name>EBR External Release Repository</name>
            <url>http://repository.springsource.com/maven/bundles/external</url>
        </repository>
    </repositories>

    <properties>
        <!-- Single place to change the Spring version used by the dependencies below. -->
        <org.springframework.version>3.0.5.RELEASE</org.springframework.version>
    </properties>

    <dependencies>
        <dependency>
            <!-- jsoup HTML parser library @ http://jsoup.org/ -->
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.5.2</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-core</artifactId>
            <version>${org.springframework.version}</version>
        </dependency>
        <dependency>
            <!-- Logging facade used by the crawler class (org.slf4j.Logger). -->
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.6.1</version>
        </dependency>

    </dependencies>
</project>


处理逻辑

package com.xly.jsoup;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.xly.jsoup.bean.WebInfoBean;
/**
 * Demo crawler built on jsoup: downloads a stock-news listing page, follows
 * every link found inside the listing container, extracts the article text
 * and prints one {@link WebInfoBean} per link.
 *
 * @author Kaikai
 * @version $Id: WebContentMain.java, v 0.1 2014-10-26 09:49:32 Kaikai Exp $
 */
public class WebContentMain {

    /** Not referenced inside this class; kept because it is part of the public surface. */
    public static final String BASE_URL = "";

    public static final Logger log = LoggerFactory.getLogger(WebContentMain.class);

    /** Not referenced inside this class. */
    static String base_url = "http://finance.sina.com.cn/";

    /** Prefix of the news listing page; the page name (e.g. {@code sh600158.phtml}) is appended. */
    static String base_info_url = "http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_AllNewsStock/symbol/";

    /** Value of the {@code id} attribute of the div that holds the article paragraphs. */
    static String sub_div_name = "artibody";

    /**
     * Crawls the news listing of the hard-coded page {@code sh600158.phtml}.
     *
     * @param args ignored
     * @throws Exception if the listing page itself cannot be downloaded
     */
    public static void main(String[] args) throws Exception {
        getDatelistBean(base_info_url + "sh600158.phtml", "datelist");
    }

    /**
     * Downloads the listing page, builds one bean for every {@code a[href]}
     * inside {@code <div class="divname">} and hands the beans to {@link #save(List)}.
     * The URL handling matches the structure of the Sina Finance pages.
     *
     * @param url     address of the listing page
     * @param divname value of the {@code class} attribute of the listing div
     * @throws IOException if the listing page cannot be fetched; a failure on a
     *                     single linked article is logged and does not abort the run
     */
    private static void getDatelistBean(String url, String divname) throws IOException {
        Document doc = Jsoup.connect(url).get();
        Elements links = doc.select("div[class=" + divname + "]").select("a[href]");
        List<WebInfoBean> list = new ArrayList<WebInfoBean>();
        for (Element link : links) {
            // Resolve against the page's base URI: a relative href cannot be
            // passed to Jsoup.connect(). absUrl() yields "" when it cannot resolve.
            String articleUrl = link.absUrl("href");
            boolean fetchable = articleUrl.length() > 0;
            if (!fetchable) {
                articleUrl = link.attr("href");
            }

            WebInfoBean bean = new WebInfoBean();
            bean.setUrl(articleUrl);
            bean.setBaseUrl(link.baseUri());
            // text() is safe for anchors without child nodes and never returns markup.
            bean.setTitle(link.text());

            String time = extractTime(articleUrl);
            if (time != null) {
                bean.setTime(time);
            }

            bean.setContent(fetchable ? fetchContentQuietly(articleUrl) : "");
            list.add(bean);
        }
        save(list);
    }

    /**
     * Derives the time string from an article URL: the second-to-last path
     * segment followed by the first four characters of the last segment.
     *
     * @param url article URL
     * @return the derived value, or {@code null} when the URL has four or fewer
     *         {@code /}-separated parts or its last segment is shorter than four characters
     */
    private static String extractTime(String url) {
        String[] parts = url.split("/");
        int count = parts.length;
        if (count <= 4) {
            return null;
        }
        String last = parts[count - 1];
        if (last.length() < 4) {
            log.warn("Cannot derive time from url {}", url);
            return null;
        }
        return parts[count - 2] + last.substring(0, 4);
    }

    /**
     * Fetches the article text and turns a download failure into an empty
     * result, so one broken link does not discard the beans already collected.
     *
     * @param url absolute article URL
     * @return the article text, or an empty string when the page could not be fetched
     */
    private static String fetchContentQuietly(String url) {
        try {
            return extContent(url, sub_div_name);
        } catch (IOException ex) {
            log.warn("Failed to fetch article content from " + url, ex);
            return "";
        }
    }

    /**
     * Downloads a page and concatenates the text of every {@code <p>} inside
     * {@code <div id="divname">}. Paragraphs are joined without a separator.
     *
     * @param url     address of the article page
     * @param divname value of the {@code id} attribute of the content div
     * @return the concatenated paragraph text; empty when nothing matches
     * @throws IOException if the page cannot be fetched
     */
    private static String extContent(String url, String divname) throws IOException {
        Document doc = Jsoup.connect(url).get();
        Elements paragraphs = doc.select("div[id=" + divname + "]").select("p");
        StringBuilder text = new StringBuilder();
        for (Element paragraph : paragraphs) {
            text.append(paragraph.text());
        }
        return text.toString();
    }

    /**
     * Prints every bean to standard output, one per line.
     *
     * @param list beans to print
     */
    private static void save(List<WebInfoBean> list) {
        for (WebInfoBean bean : list) {
            System.out.println(bean.toString());
        }
    }

}


Bean 类(com.xly.jsoup.bean.WebInfoBean)
/**
 * Mutable holder for one crawled link. Every property is a {@code String}
 * and stays {@code null} until its setter is called.
 */
public class WebInfoBean {

    private String id;
    private String title;
    private String index;
    private String time;
    private String content;
    private String baseUrl;
    private String url;

    /** @return the id, or {@code null} if never set */
    public String getId() {
        return this.id;
    }

    /** @param id the id to store */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the title, or {@code null} if never set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the index, or {@code null} if never set */
    public String getIndex() {
        return this.index;
    }

    /** @param index the index to store */
    public void setIndex(String index) {
        this.index = index;
    }

    /** @return the time, or {@code null} if never set */
    public String getTime() {
        return this.time;
    }

    /** @param time the time to store */
    public void setTime(String time) {
        this.time = time;
    }

    /** @return the content, or {@code null} if never set */
    public String getContent() {
        return this.content;
    }

    /** @param content the content to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the base URL, or {@code null} if never set */
    public String getBaseUrl() {
        return this.baseUrl;
    }

    /** @param baseUrl the base URL to store */
    public void setBaseUrl(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    /** @return the URL, or {@code null} if never set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Renders all seven properties as
     * {@code WebInfoBean [id=..., title=..., index=..., time=..., content=..., baseUrl=..., url=...]};
     * unset properties appear as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder out = new StringBuilder("WebInfoBean [");
        out.append("id=").append(id);
        out.append(", title=").append(title);
        out.append(", index=").append(index);
        out.append(", time=").append(time);
        out.append(", content=").append(content);
        out.append(", baseUrl=").append(baseUrl);
        out.append(", url=").append(url);
        out.append("]");
        return out.toString();
    }

}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值