/**
 * Fetches the resource at the given URL and returns its body as text.
 * Intended for crawling web pages, e.g. {@code http://www.126.com}.
 *
 * <p>The raw response bytes are decoded as GBK regardless of any charset the
 * server declares. This method configures no proxy itself.
 *
 * <p>Failures are reported through a sentinel rather than an exception: if the
 * URL is {@code null} or malformed, or opening/reading the stream fails for any
 * reason, the literal string {@code "empty"} is returned. Callers therefore
 * cannot distinguish a failed fetch from a page whose entire content is
 * {@code "empty"}.
 *
 * @param url absolute URL of the page to download
 * @return the page content decoded as GBK (possibly an empty string), or
 *         {@code "empty"} if the page could not be fetched
 */
public String myGetHttpFile2(String url){
    InputStream pageStream = null;
    try
    {
        pageStream = new URL(url).openStream();

        // Collect the raw bytes in chunks; decoding happens once at the end so
        // multi-byte GBK sequences split across reads are handled correctly.
        java.io.ByteArrayOutputStream rawBytes = new java.io.ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int bytesRead;
        while ((bytesRead = pageStream.read(chunk)) > -1)
        {
            rawBytes.write(chunk, 0, bytesRead);
        }
        return rawBytes.toString("GBK");
    }
    catch (Exception e)
    {
        // Deliberate best-effort: existing callers expect the "empty" sentinel
        // for every kind of failure (bad URL, I/O error, unsupported protocol).
        return "empty";
    }
    finally
    {
        // Always release the connection, including when reading failed midway.
        if (pageStream != null)
        {
            try
            {
                pageStream.close();
            }
            catch (Exception ignored)
            {
                // Nothing useful can be done if close fails; the result is already decided.
            }
        }
    }
}
11-15
11-15