package socket;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal HTTP client that downloads a page (decoded as GB2312) and collects
 * every {@code http://} link found in it.
 *
 * <p>Matches collected by {@link #findText(String, String)} and
 * {@link #getDocumentAt(String)} are kept in an internal list that
 * {@link #PrintList()} writes to standard output. Instances are not
 * synchronized.
 */
public class URLClient {
    /**
     * Regular expression for absolute {@code http://} URLs: one or more
     * dot-terminated host labels, a final label, and an optional path/query.
     * The hyphen is placed last in the path character class so that it is
     * unambiguously a literal.
     */
    static final String URL_EXPRESSION = "http://([\\w-]+\\.)+[\\w-]+(/[\\w ./?%&=-]*)?";

    /** Compiled once so the expression is not recompiled for every line of a page. */
    private static final Pattern URL_PATTERN = Pattern.compile(URL_EXPRESSION);

    /** Matches collected so far; never null so the public methods work before any fetch. */
    private ArrayList<String> list = new ArrayList<String>();

    /** The connection opened by the most recent {@link #getDocumentAt(String)} call, if any. */
    protected HttpURLConnection connection;

    /**
     * Prints every collected match on its own line, followed by a summary line
     * with the number of matches. Prints only the summary (with a count of 0)
     * when nothing has been collected yet.
     */
    public void PrintList() {
        for (String match : list) {
            System.out.println(match);
        }
        System.out.println("总共提取出" + list.size() + "条数据");
    }

    /**
     * Appends every substring of {@code text} that matches {@code expression}
     * to the collected matches.
     *
     * @param expression regular expression to search for
     * @param text       text to search; {@code null} or empty text adds nothing
     * @throws java.util.regex.PatternSyntaxException if {@code expression} is not a valid pattern
     */
    public void findText(String expression, String text) {
        collectMatches(Pattern.compile(expression), text);
    }

    /** Adds each match of {@code pattern} in {@code text} to the collected matches. */
    private void collectMatches(Pattern pattern, String text) {
        if (text == null || text.length() == 0) {
            return;
        }
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            list.add(matcher.group());
        }
    }

    /**
     * Demo entry point: runs a Baidu search, prints the page and then the
     * links extracted from it.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        URLClient client = new URLClient();
        String keyword;
        try {
            // The query declares ie=GB2312, so the keyword must be percent-encoded in that charset.
            keyword = java.net.URLEncoder.encode("天涯", "GB2312");
        } catch (UnsupportedEncodingException e) {
            System.out.println("GB2312 encoding is not supported: " + e.getMessage());
            return;
        }
        System.out.println(client.getDocumentAt(
                "http://www.baidu.com/s?ie=GB2312&wd=" + keyword + "&rn=50"));
        client.PrintList();
    }

    /**
     * Downloads the document at {@code urlString}, decoding it as GB2312, and
     * collects every {@code http://} link it contains.
     *
     * <p>The collected matches are replaced only once the response stream has
     * been opened; if the request fails earlier, previously collected matches
     * are left untouched. Failures are reported on standard output rather than
     * thrown.
     *
     * @param urlString absolute HTTP URL to fetch
     * @return the lines of the document, each terminated by {@code '\n'};
     *         whatever was read before a failure, possibly the empty string
     */
    public String getDocumentAt(String urlString) {
        StringBuilder document = new StringBuilder();
        BufferedReader reader = null;
        // Track this call's connection locally so cleanup never touches a stale or null field.
        HttpURLConnection conn = null;
        try {
            URL url = new URL(urlString);
            conn = (HttpURLConnection) url.openConnection();
            connection = conn;
            conn.setRequestProperty("User-Agent", "Sogou Orion spider/3.0");
            reader = new BufferedReader(new InputStreamReader(conn.getInputStream(), "gb2312"));
            list = new ArrayList<String>();
            long start = System.currentTimeMillis();
            String line;
            while ((line = reader.readLine()) != null) {
                collectMatches(URL_PATTERN, line);
                document.append(line).append('\n');
            }
            System.out.println("Spend Times:" + (System.currentTimeMillis() - start) + "ms");
        } catch (MalformedURLException e) {
            System.out.println("Unable to connect to URL: " + urlString);
        } catch (IOException e) {
            System.out.println("IOException when connecting to URL: "
                    + urlString + " (" + e.getMessage() + ")");
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (conn != null) {
                conn.disconnect();
            }
        }
        return document.toString();
    }
}
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal HTTP client that downloads a page (decoded as GB2312) and collects
 * every {@code http://} link found in it.
 *
 * <p>Matches collected by {@link #findText(String, String)} and
 * {@link #getDocumentAt(String)} are kept in an internal list that
 * {@link #PrintList()} writes to standard output. Instances are not
 * synchronized.
 */
public class URLClient {
    /**
     * Regular expression for absolute {@code http://} URLs: one or more
     * dot-terminated host labels, a final label, and an optional path/query.
     * The hyphen is placed last in the path character class so that it is
     * unambiguously a literal.
     */
    static final String URL_EXPRESSION = "http://([\\w-]+\\.)+[\\w-]+(/[\\w ./?%&=-]*)?";

    /** Compiled once so the expression is not recompiled for every line of a page. */
    private static final Pattern URL_PATTERN = Pattern.compile(URL_EXPRESSION);

    /** Matches collected so far; never null so the public methods work before any fetch. */
    private ArrayList<String> list = new ArrayList<String>();

    /** The connection opened by the most recent {@link #getDocumentAt(String)} call, if any. */
    protected HttpURLConnection connection;

    /**
     * Prints every collected match on its own line, followed by a summary line
     * with the number of matches. Prints only the summary (with a count of 0)
     * when nothing has been collected yet.
     */
    public void PrintList() {
        for (String match : list) {
            System.out.println(match);
        }
        System.out.println("总共提取出" + list.size() + "条数据");
    }

    /**
     * Appends every substring of {@code text} that matches {@code expression}
     * to the collected matches.
     *
     * @param expression regular expression to search for
     * @param text       text to search; {@code null} or empty text adds nothing
     * @throws java.util.regex.PatternSyntaxException if {@code expression} is not a valid pattern
     */
    public void findText(String expression, String text) {
        collectMatches(Pattern.compile(expression), text);
    }

    /** Adds each match of {@code pattern} in {@code text} to the collected matches. */
    private void collectMatches(Pattern pattern, String text) {
        if (text == null || text.length() == 0) {
            return;
        }
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            list.add(matcher.group());
        }
    }

    /**
     * Demo entry point: runs a Baidu search, prints the page and then the
     * links extracted from it.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        URLClient client = new URLClient();
        String keyword;
        try {
            // The query declares ie=GB2312, so the keyword must be percent-encoded in that charset.
            keyword = java.net.URLEncoder.encode("天涯", "GB2312");
        } catch (UnsupportedEncodingException e) {
            System.out.println("GB2312 encoding is not supported: " + e.getMessage());
            return;
        }
        System.out.println(client.getDocumentAt(
                "http://www.baidu.com/s?ie=GB2312&wd=" + keyword + "&rn=50"));
        client.PrintList();
    }

    /**
     * Downloads the document at {@code urlString}, decoding it as GB2312, and
     * collects every {@code http://} link it contains.
     *
     * <p>The collected matches are replaced only once the response stream has
     * been opened; if the request fails earlier, previously collected matches
     * are left untouched. Failures are reported on standard output rather than
     * thrown.
     *
     * @param urlString absolute HTTP URL to fetch
     * @return the lines of the document, each terminated by {@code '\n'};
     *         whatever was read before a failure, possibly the empty string
     */
    public String getDocumentAt(String urlString) {
        StringBuilder document = new StringBuilder();
        BufferedReader reader = null;
        // Track this call's connection locally so cleanup never touches a stale or null field.
        HttpURLConnection conn = null;
        try {
            URL url = new URL(urlString);
            conn = (HttpURLConnection) url.openConnection();
            connection = conn;
            conn.setRequestProperty("User-Agent", "Sogou Orion spider/3.0");
            reader = new BufferedReader(new InputStreamReader(conn.getInputStream(), "gb2312"));
            list = new ArrayList<String>();
            long start = System.currentTimeMillis();
            String line;
            while ((line = reader.readLine()) != null) {
                collectMatches(URL_PATTERN, line);
                document.append(line).append('\n');
            }
            System.out.println("Spend Times:" + (System.currentTimeMillis() - start) + "ms");
        } catch (MalformedURLException e) {
            System.out.println("Unable to connect to URL: " + urlString);
        } catch (IOException e) {
            System.out.println("IOException when connecting to URL: "
                    + urlString + " (" + e.getMessage() + ")");
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (conn != null) {
                conn.disconnect();
            }
        }
        return document.toString();
    }
}