package com.xh.crawle;
import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.sql.rowset.spi.SyncFactory;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
/**
 * Multi-threaded crawler.
 *
 * <p>{@link #main(String[])} starts several threads that all run the same {@code Test}
 * instance. Each thread downloads the home page, saves it to disk, collects the links
 * that contain the home page's domain into the shared {@link #urls} queue, and then
 * drains that queue by downloading every queued link in turn.
 *
 * <p>Thread-safety: the shared queue and the set of already-discovered links are guarded
 * by one private lock, and the file counter by another. No lock is held while a page is
 * being downloaded, so the threads fetch in parallel.
 *
 * @author kali
 */
public class Test implements Runnable{
    /** Extracts the domain (for example {@code jd.com}) from an http(s) home page URL. */
    private static final Pattern DOMAIN_PATTERN = Pattern.compile(
            "(?<=https?\\://[a-zA-Z0-9]{0,100}[.]{0,1})[^.\\s]*?\\.(com|cn|net|org|biz|info|cc|tv)",
            Pattern.CASE_INSENSITIVE);

    /** Guards {@link #urls} and {@link #seenUrls}. */
    private static final Object URL_LOCK = new Object();

    /** Every link ever queued or fetched, so that a page is never queued a second time. */
    private static final Set<String> seenUrls = new HashSet<String>();

    /** Guards {@link #i}. */
    private final Object counterLock = new Object();

    // The four fields below are only informational snapshots of the most recent request.
    // The instance is shared between threads, so the crawling logic itself works on locals.
    String content;
    HttpClient client;
    GetMethod getMethod;
    String myDomain;

    /** Home page the crawl starts from; its domain decides which links are followed. */
    static String strHomePage;

    /** Queue of links that were discovered but not fetched yet. */
    static List<String> urls=new ArrayList<String>();

    /** Number of {@link #getContent(String)} calls so far; used in the saved file names. */
    int i=0;

    /**
     * Downloads {@code url}. On HTTP 200 the body is saved to
     * {@code F:\jd2\jd_<n>_<thread name>.html} and its links are added to {@link #urls}.
     *
     * <p>Any failure (bad URL, network error, file error) is printed and swallowed so that
     * one broken link cannot stop the crawl.
     *
     * @param url absolute URL to download
     * @return the body of this request, or {@code null} if the status was not 200 or the
     *         request failed
     */
    public String getContent(String url)
    {
        int fileIndex;
        synchronized (counterLock) {
            i += 1;
            fileIndex = i;
        }
        synchronized (URL_LOCK) {
            // A page that is being fetched must not be queued again when it links to itself.
            seenUrls.add(url);
        }

        HttpClient httpClient = new HttpClient();
        GetMethod request = null;
        String page = null;
        try {
            // Created inside the try block: the constructor rejects malformed URLs with an
            // unchecked exception, which must not escape and kill the calling thread.
            request = new GetMethod(url);
            client = httpClient;
            getMethod = request;

            int status = httpClient.executeMethod(request);
            if (status == HttpStatus.SC_OK) {
                page = in_str(request.getResponseBodyAsStream());
                content = page;
                savePage(page, fileIndex);
                // Links are only extracted from a body that was actually received.
                getUrl(page);
            }
        } catch (Exception e) {
            // Per-URL boundary: report the failure and let the caller continue.
            e.printStackTrace();
        } finally {
            if (request != null) {
                request.releaseConnection();
            }
        }
        return page;
    }

    /** Writes one downloaded page to its numbered file and always closes the writer. */
    private void savePage(String page, int fileIndex) throws IOException {
        FileWriter writer = new FileWriter("F:\\jd2\\jd_"+fileIndex+"_"+Thread.currentThread().getName()+".html");
        try {
            writer.write(page);
        } finally {
            writer.close();
        }
    }

    /**
     * Starts ten crawler threads, half a second apart, on {@code http://www.jd.com}.
     * All threads share a single {@code Test} instance.
     *
     * @param args unused
     * @throws InterruptedException if the launcher thread is interrupted between starts
     */
    public static void main(String[] args) throws InterruptedException {
        Test test=new Test();
        strHomePage="http://www.jd.com";
        for(int n=0;n<10;n++)
        {
            new Thread(test).start();
            Thread.sleep(500);
        }
    }

    /**
     * Adds to {@link #urls} every absolute http(s) link found in an {@code href} attribute
     * of {@code cont} whose text, before any {@code ?}, contains the home page's domain.
     * Links that were queued or fetched before are skipped.
     *
     * @param cont page text to scan; {@code null} is treated as an empty page
     * @return the shared queue {@link #urls}
     */
    public List<String> getUrl(String cont)
    {
        String domain = getDomain();
        myDomain = domain;
        if (cont == null || domain == null) {
            // Nothing to scan, or no domain to filter by.
            return urls;
        }
        // The domain is quoted so that its dots match literally.
        String regUrl = "(?<=(href=)[\"]?[\']?)https?://[^\\s\"\'\\?]*(" + Pattern.quote(domain) + ")[^\\s\"\'>]*";
        Pattern p = Pattern.compile(regUrl, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(cont);
        while (m.find())
        {
            String link = m.group(0);
            synchronized (URL_LOCK) {
                if (seenUrls.add(link) && !urls.contains(link))
                {
                    urls.add(link);
                    System.out.println("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"+link);
                    System.out.println("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"+urls.size());
                }
            }
        }
        return urls;
    }

    /**
     * Extracts the domain of {@link #strHomePage}, for example {@code jd.com} for
     * {@code http://www.jd.com}.
     *
     * @return the domain, or {@code null} if the home page is unset or does not match
     */
    public String getDomain() {
        String homePage = strHomePage;
        if (homePage == null) {
            return null;
        }
        Matcher m = DOMAIN_PATTERN.matcher(homePage);
        if (m.find()) {
            return m.group(0);
        }
        return null;
    }

    /**
     * Drains {@code list}: repeatedly removes its first link and downloads it with
     * {@link #getContent(String)} until the list is empty. Downloading may append newly
     * discovered links to {@link #urls}.
     *
     * <p>Only the removal is done under the lock; the download is not, so several threads
     * can drain the same list in parallel. A link is removed before it is fetched, so a
     * link that fails is not retried.
     *
     * @param list queue of links to download; {@code null} is treated as empty
     */
    public void Crawle(List<String> list)
    {
        if (list == null) {
            return;
        }
        while (true)
        {
            String next;
            synchronized (URL_LOCK) {
                if (list.isEmpty()) {
                    return;
                }
                next = list.remove(0);
            }
            getContent(next);
            int pending;
            synchronized (URL_LOCK) {
                pending = urls.size();
            }
            System.out.println("*************************************************"+next);
            System.out.println("*************************************************"+pending);
        }
    }

    /**
     * Reads {@code in} to its end and returns the text unchanged, line terminators
     * included. The stream is closed afterwards.
     *
     * @param in stream to read; {@code null} yields an empty string
     * @return the decoded text
     * @throws IOException if reading fails
     */
    public String in_str(InputStream in) throws IOException
    {
        if (in == null) {
            return "";
        }
        // NOTE(review): decodes with the platform default charset, as before - confirm
        // whether the charset announced by the server should be used instead.
        BufferedReader reader = new BufferedReader(new InputStreamReader(in));
        StringBuilder text = new StringBuilder();
        try {
            char[] chunk = new char[8192];
            int read = reader.read(chunk);
            while (read != -1)
            {
                text.append(chunk, 0, read);
                read = reader.read(chunk);
            }
        } finally {
            reader.close();
        }
        return text.toString();
    }

    /** Downloads the home page, then drains the shared link queue. */
    @Override
    public void run() {
        getContent(strHomePage);
        Crawle(urls);
    }
}
java写一个爬虫
最新推荐文章于 2022-08-04 17:07:39 发布