Stock2

如果页面的编码是 GBK, 需要在创建 InputStreamReader 时显式指定字符集 (否则会按平台默认编码解码, 中文可能出现乱码):
new InputStreamReader(conn.getInputStream(),"gbk")

下载天风证券的信息

/**
 * Simple mutable bean describing one news item by its name, URL and date.
 * All three properties are stored as plain strings and start out as {@code null}.
 */
public class News {

    private String name;
    private String url;
    private String date;

    /** @return the name of the news item, or {@code null} if none was set */
    public String getName() {
        return this.name;
    }

    /** @param name the name to store for this news item */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the URL of the news item, or {@code null} if none was set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL to store for this news item */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the date of the news item, or {@code null} if none was set */
    public String getDate() {
        return this.date;
    }

    /** @param date the date to store for this news item */
    public void setDate(String date) {
        this.date = date;
    }

    /**
     * Renders all three properties in the form
     * {@code News [name=..., url=..., date=...]}; unset properties print as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("News [name=");
        text.append(name);
        text.append(", url=").append(url);
        text.append(", date=").append(date);
        text.append("]");
        return text.toString();
    }
}


import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.cup.cupsecure.util.lang.StringUtils;

/**
 * Downloads news articles from the list pages at {@link #v1} and stores the text of each
 * article as a UTF-8 file below {@code D:/stock/stockNews/<yyyy-MM>/}.
 *
 * <p>Flow: {@link #main} calls {@link #m5}, which calls {@link #m4} once for every page number
 * from {@link #startPage} to {@link #endPage}. {@code m4} collects the entries of one list page
 * and then downloads every article it found.
 */
public class Stock2 {

    /** First list page to fetch (inclusive). */
    public static int startPage = 1;

    /** Last list page to fetch (inclusive). */
    public static int endPage = 3;

    /** URL of a list page; the page number is appended. */
    public static String v1 = "http://www.tfzq.com/ContentList.aspx?nodeid=43&page=";

    /** Site root; the relative article URL taken from the list page is appended. */
    public static String v2 = "http://www.tfzq.com/";

    /** Not referenced inside this class; kept because it is a public field. */
    public static Pattern p = Pattern.compile("<[^>]+>([^<]*)</[^>]+>");

    /**
     * Charset used to decode the downloaded pages. It defaults to the platform charset, which
     * is what the readers fell back to when no charset was passed; set it to {@code "gbk"}
     * when the pages are served in GBK.
     */
    public static String pageCharset = java.nio.charset.Charset.defaultCharset().name();

    /** Text that identifies a line of the list page holding one news entry. */
    private static final String LIST_ITEM_MARKER = "<li><a href=\"/detail.aspx";

    /** Number of characters in front of the relative URL: {@code <li><a href="/}. */
    private static final int URL_OFFSET = "<li><a href=\"/".length();

    /** Length of a date such as {@code 2012-01-06}. */
    private static final int DATE_LENGTH = 10;

    /** Root folder for the saved articles. */
    private static final String OUTPUT_DIR = "D:/stock/stockNews/";

    /** Matches any markup tag; compiled once instead of once per downloaded line. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>", Pattern.CASE_INSENSITIVE);

    /** Characters replaced by a blank when a headline is turned into a file name. */
    private static final String[] FILE_NAME_BLANKED = {
        ":", "“", "”", "?", "<", ">", "*",
        // Also not allowed in Windows file names; without these a headline containing one of
        // them could only be stored in the shared fallback file.
        "/", "\\", "|"
    };

    /**
     * Program entry point; downloads all configured pages.
     *
     * @param args ignored
     * @throws Exception if a page cannot be downloaded or a file cannot be written
     */
    public static void main(String[] args) throws Exception {
        m5();
    }

    /**
     * Processes every list page from {@link #startPage} to {@link #endPage} (both inclusive),
     * printing the page number before each one.
     *
     * @throws Exception if a page cannot be downloaded or a file cannot be written
     */
    public static void m5() throws Exception {
        for (int i = startPage; i <= endPage; i++) {
            System.out.println(i + "----------------------");
            m4(i);
        }
    }

    /**
     * Downloads one list page, extracts its news entries and saves the article behind each one.
     *
     * @param page number appended to {@link #v1} to form the list page URL
     * @throws Exception if a page cannot be downloaded or a file cannot be written
     */
    public static void m4(int page) throws Exception {
        List<News> list = new ArrayList<News>();
        HttpURLConnection conn = (HttpURLConnection) new URL(v1 + page).openConnection();
        BufferedReader br = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), pageCharset));
        try {
            String temp;
            while ((temp = br.readLine()) != null) {
                if (!temp.contains(LIST_ITEM_MARKER)) {
                    continue;
                }
                News news = parseListItem(temp.trim());
                if (news == null) {
                    // A partial entry used to abort the whole run with a NullPointerException
                    // when its missing date was read; skip just this entry instead.
                    System.err.println("Skipping unparseable list entry: " + temp.trim());
                    continue;
                }
                list.add(news);
            }
        } finally {
            // The reader was never closed before.
            br.close();
        }

        for (News news : list) {
            saveArticle(news);
        }
    }

    /**
     * Extracts URL, headline and date from one list line of the form
     * {@code <li><a href="/detail.aspx?cid=24435">headline</a><span>2012-01-06</span></li>}.
     *
     * <p>The positions are derived from the markup itself rather than from fixed offsets, so
     * the headline is cut correctly regardless of how many digits the {@code cid} has and of
     * any text in front of the {@code <li>}.
     *
     * @param line trimmed line that contains {@link #LIST_ITEM_MARKER}
     * @return the parsed entry, or {@code null} if the line lacks one of the expected parts
     */
    private static News parseListItem(String line) {
        int itemStart = line.indexOf(LIST_ITEM_MARKER);
        if (itemStart < 0) {
            return null;
        }
        // The "> that closes the href attribute.
        int hrefEnd = line.indexOf("\">", itemStart);
        if (hrefEnd < 0) {
            return null;
        }
        int anchorEnd = line.indexOf("</a>", hrefEnd);
        if (anchorEnd < 0) {
            return null;
        }
        int spanEnd = line.indexOf("</span", anchorEnd);
        // The date is the ten characters right in front of </span and must lie behind the </a>.
        if (spanEnd < 0 || spanEnd - DATE_LENGTH < anchorEnd + "</a>".length()) {
            return null;
        }

        News news = new News();
        // The leading slash is left out because v2 already ends with one.
        news.setUrl(line.substring(itemStart + URL_OFFSET, hrefEnd));
        news.setName(sanitizeFileName(line.substring(hrefEnd + "\">".length(), anchorEnd)));
        news.setDate(line.substring(spanEnd - DATE_LENGTH, spanEnd));
        return news;
    }

    /**
     * Turns a headline into text usable as part of a file name: double quotes are dropped and
     * every character listed in {@link #FILE_NAME_BLANKED} becomes a blank.
     *
     * <p>NOTE(review): the earlier code replaced ":" twice with visually identical literals and
     * ended with a {@code replaceAll("", "")} that changes nothing; those calls presumably
     * targeted characters that were lost in transcription - confirm against the real source.
     *
     * @param headline raw headline text taken from the list page
     * @return the headline with the problematic characters removed or blanked
     */
    private static String sanitizeFileName(String headline) {
        String cleaned = headline.replace("\"", "");
        for (String forbidden : FILE_NAME_BLANKED) {
            cleaned = cleaned.replace(forbidden, " ");
        }
        return cleaned;
    }

    /**
     * Downloads the article of one entry and writes its text, one line per non-empty source
     * line, terminated by {@code \r\n}.
     *
     * <p>Only the lines after the one containing {@code <div class="m_right_content">} are
     * written, and reading stops at the first line containing {@code <div class="footer">} or
     * {@code <!--<p></p>}. Tags and blanks are removed from every written line.
     *
     * @param news entry with date, name and relative URL all set
     * @throws Exception if the article cannot be downloaded or the file cannot be written
     */
    private static void saveArticle(News news) throws Exception {
        // Articles are grouped by month: the first seven characters of the date (yyyy-MM).
        String dateFolder = news.getDate().substring(0, 7);
        File existFolder = new File(OUTPUT_DIR + dateFolder);
        if (!existFolder.isDirectory()) {
            existFolder.mkdirs();
        }

        BufferedWriter bw = openArticleWriter(news, dateFolder);
        try {
            HttpURLConnection conn2 = (HttpURLConnection) new URL(v2 + news.getUrl())
                    .openConnection();
            BufferedReader br2 = new BufferedReader(
                    new InputStreamReader(conn2.getInputStream(), pageCharset));
            try {
                boolean inBody = false;
                String temp2;
                while ((temp2 = br2.readLine()) != null) {
                    if (temp2.contains("<div class=\"footer\">")
                            || temp2.contains("<!--<p></p>")) {
                        break;
                    }
                    if (inBody) {
                        String content = TAG_PATTERN.matcher(temp2).replaceAll("");
                        // NOTE(review): the earlier code ran this replacement three times with
                        // visually identical blank literals; the extra calls had no effect as
                        // written - confirm whether other space characters were intended.
                        content = content.replace(" ", "");
                        if (!StringUtils.isEmpty(content)) {
                            bw.write(content);
                            bw.write("\r\n");
                        }
                    }
                    // Checked after writing so the opening div line itself is not written.
                    if (temp2.contains("<div class=\"m_right_content\">")) {
                        inBody = true;
                    }
                }
            } finally {
                // The reader was never closed before.
                br2.close();
            }
        } finally {
            // Close the file even when the download fails, so the handle does not leak.
            bw.close();
        }
    }

    /**
     * Opens the UTF-8 output file {@code <yyyy-MM>/<date> <name>.txt} for an entry. If that
     * file cannot be created, falls back to {@code <date> XXX.txt} directly in the output root
     * and prints the stack trace of the failure.
     *
     * @param news entry whose date and name form the file name
     * @param dateFolder name of the month folder the file belongs to
     * @return a writer on the article file or on the fallback file
     * @throws FileNotFoundException if the fallback file cannot be created either
     * @throws UnsupportedEncodingException if UTF-8 is not available
     */
    private static BufferedWriter openArticleWriter(News news, String dateFolder)
            throws FileNotFoundException, UnsupportedEncodingException {
        File fs2 = new File(OUTPUT_DIR + dateFolder + "/" + news.getDate() + " "
                + news.getName() + ".txt");
        try {
            return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fs2), "UTF-8"));
        } catch (FileNotFoundException e) {
            File fs3 = new File(OUTPUT_DIR + news.getDate() + " XXX.txt");
            BufferedWriter fallback = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(fs3), "UTF-8"));
            e.printStackTrace();
            return fallback;
        }
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值