java抓取热门分类标题_根据输入的url列表文件,多线程抓取url对应的标题、关键词和描述(title、keywords和description),并输出到文件

1.[代码][Java]代码

import java.io.BufferedReader;

import java.io.BufferedWriter;

import java.io.FileReader;

import java.io.FileWriter;

import java.io.IOException;

import java.net.URL;

import java.util.concurrent.ExecutorService;

import java.util.concurrent.Executors;

import java.util.concurrent.TimeUnit;

import org.jsoup.Connection;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

/**
 * Reads URLs from a shared input file with several worker threads, fetches each page
 * with jsoup, and writes one tab-separated line per page (url, title, keywords,
 * description) to a shared output file.
 *
 * <p>The reader and writer are static, so every instance shares them; each worker
 * synchronizes on them while taking the next URL or writing a result.
 *
 * @author lujianfeng@miaozhen.com
 */
public class GetTitleRunnable implements Runnable
{
    /** Shared source of URLs, one per line; opened by the two-argument constructor. */
    private static BufferedReader br;

    /** Shared sink for result lines; opened by the two-argument constructor. */
    private static BufferedWriter bw;

    /**
     * Worker loop: repeatedly takes the next URL from the shared reader, fetches its
     * metadata with {@link #getTitle(String)} and, when a result is available, writes
     * {@code url \t result \n} to the shared writer. Returns when the input is exhausted.
     *
     * @throws IllegalStateException if the shared files have not been opened
     * @throws RuntimeException wrapping any {@link IOException} raised while reading
     *         the input file or writing the output file
     */
    @Override
    public void run() {
        // Fail with a clear message instead of a NullPointerException from synchronized(null).
        if (br == null || bw == null) {
            throw new IllegalStateException(
                    "input/output files are not open; construct GetTitleRunnable(in, out) first");
        }
        while (true) {
            try {
                String url;
                synchronized (br) {
                    url = br.readLine();
                }
                if (url == null) {
                    break; // end of the URL list
                }
                // The fetch runs outside any lock so workers download in parallel.
                String title = getTitle(url);
                if (title != null) {
                    synchronized (bw) {
                        bw.write(url + "\t" + title + "\n");
                    }
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }

    /**
     * Closes the shared reader and writer if they were opened. The writer is closed
     * even when closing the reader fails, so buffered results still reach the file.
     *
     * @throws RuntimeException wrapping any {@link IOException} raised while closing
     */
    public static void close() {
        try {
            try {
                if (br != null) {
                    br.close();
                }
            } finally {
                if (bw != null) {
                    bw.close();
                }
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** Creates a worker that uses the shared reader and writer opened by another instance. */
    public GetTitleRunnable() {}

    /**
     * Creates a worker and opens the shared input and output files if they are not
     * open yet. A stream that is already open is left untouched.
     *
     * @param in  path of the file that lists the URLs, one per line
     * @param out path of the file that receives the results
     * @throws RuntimeException wrapping the {@link IOException} if a file cannot be opened
     */
    public GetTitleRunnable(String in, String out) {
        try {
            // Open each stream only when missing so an already-open one is not replaced and leaked.
            // NOTE(review): FileReader/FileWriter use the platform default charset — confirm
            // that this matches the encoding expected for the URL list and the output.
            if (br == null) {
                br = new BufferedReader(new FileReader(in));
            }
            if (bw == null) {
                bw = new BufferedWriter(new FileWriter(out));
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Fetches {@code url} and extracts the contents of its {@code <title>} elements and
     * of the {@code content} attribute of elements named {@code keywords} and
     * {@code description} found under {@code <head>}.
     *
     * @param url address of the page to fetch
     * @return {@code title \t keywords \t description} with all CR/LF characters removed,
     *         or {@code null} when the page could not be fetched (the URL is then printed
     *         to standard output) or when the extracted text is 3 characters or shorter
     */
    public static String getTitle(String url) {
        Document doc;
        try {
            // Constructed only for validation: a malformed URL raises
            // MalformedURLException, which is an IOException and is handled below.
            new URL(url);
            Connection con = Jsoup.connect(url);
            con.userAgent("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)");
            doc = con.get();
        } catch (IOException e) {
            System.out.println(url);
            return null;
        }

        Elements heads = doc.getElementsByTag("head");
        StringBuilder sb = new StringBuilder();
        for (Element head : heads) {
            for (Element title : head.getElementsByTag("title")) {
                sb.append(title.text());
            }
            sb.append("\t");
            for (Element key : head.getElementsByAttributeValue("name", "keywords")) {
                sb.append(key.attr("content"));
            }
            // Tab, not newline: newlines are stripped below, which previously glued the
            // keywords and the description together without any separator.
            sb.append("\t");
            for (Element desc : head.getElementsByAttributeValue("name", "description")) {
                sb.append(desc.attr("content"));
            }
        }
        // Each head contributes two separator characters, so a length of 3 or less means
        // at most one character of content was extracted; such results are discarded.
        return sb.length() > 3 ? sb.toString().replaceAll("[\r\n]", "") : null;
    }

    /**
     * Starts {@code threadsNum} workers on a cached thread pool, waits up to five days
     * for them to finish, and then closes the shared files.
     *
     * @param threadsNum number of worker tasks to start
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public void MultiThreadsGetTitle(int threadsNum) throws InterruptedException {
        ExecutorService executor = Executors.newCachedThreadPool();
        try {
            for (int i = 0; i < threadsNum; i++) {
                executor.execute(new GetTitleRunnable());
                System.out.println("thread " + i + " started");
            }
            executor.shutdown();
            executor.awaitTermination(5, TimeUnit.DAYS);
        } finally {
            // Close (and thereby flush) the files even when the wait is interrupted,
            // so results that were already written are not lost.
            GetTitleRunnable.close();
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} URL list file, {@code args[1]} output file,
     *             {@code args[2]} number of worker threads
     * @throws InterruptedException if interrupted while waiting for the workers
     */
    public static void main(String[] args) throws InterruptedException {
        if (args.length < 3) {
            System.err.println("usage: GetTitleRunnable <url-list-file> <output-file> <threads>");
            System.exit(1);
        }
        new GetTitleRunnable(args[0], args[1]).MultiThreadsGetTitle(Integer.parseInt(args[2]));
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值