直接访问 www.google.com 显然是行不通的,这里附上两个可以直接访问的 IP 地址:
http://209.85.225.23/
http://173.194.14.53/
通过这两个地址搜索时,默认使用的查询参数后缀是:
newwindow=1&q=
这里不采用上面这种格式,而是使用如下格式:
http://209.85.225.23/search?hl=zh&ie=gb2312&q=
在 q= 后面加上要搜索的内容(需要先按 gb2312 做 URL 编码)。
但是用程序直接这样访问会被 Google 拒绝,需要设置 User-Agent 请求头,伪装成浏览器去访问。
在 Java 中,按如下方式设置请求头:
// path holds the complete search URL, including the encoded query after "q=".
URL url=new URL(path);
// openConnection() only creates the connection object; request headers can still be set on it.
HttpURLConnection con = (HttpURLConnection) url.openConnection();
// Send a browser User-Agent header so the request is not made with the default Java client identity.
con.setRequestProperty("User-Agent", "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14");
注意:User-Agent 的取值(即浏览器标识)不同,服务器返回内容的编码方式也可能不同;如果出现乱码,可以尝试换一个 User-Agent 值。
附上完整代码:
package com.search.google;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import com.mysql.jdbc.Field;
/**
 * Fetches a Google result page for a query by requesting a fixed Google IP address
 * with a browser User-Agent header, and keeps a copy of the fetched page on disk.
 */
public class GetResult
{
    /** Search endpoint; the percent-encoded query is appended directly after {@code q=}. */
    private static final String SEARCH_URL = "http://209.85.225.23/search?hl=zh&ie=gb2312&q=";

    /** Encoding named in the {@code ie=} parameter and used to percent-encode the query. */
    private static final String QUERY_ENCODING = "gb2312";

    /** Browser identity sent with every request. */
    private static final String USER_AGENT = "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14";

    /** File that receives a copy of the fetched page. */
    private static final String DUMP_PATH = "./test/google";

    /**
     * Command-line entry point: runs one fixed sample search.
     *
     * @param args ignored
     * @throws Exception if the page cannot be fetched or the dump file cannot be written
     */
    public static void main(String []args) throws Exception
    {
        new GetResult().process();
    }

    /**
     * Fetches the result page for a fixed sample query, prints it to standard output
     * and hands it to {@link #parseHTML(String)}.
     *
     * @throws Exception if the page cannot be fetched or the dump file cannot be written
     */
    public void process() throws Exception
    {
        String html = getHTML("西游记 作者");
        System.out.println(html);
        parseHTML(html);
    }

    /**
     * Downloads the search result page for {@code str} and also writes it to
     * {@code ./test/google}.
     *
     * <p>The response is decoded with the charset named in the Content-Type header,
     * falling back to gb2312 when none is usable, instead of the platform default
     * charset. The dump file is always written as UTF-8, and its parent directory is
     * created when missing. Line terminators are not kept: the lines of the response
     * are concatenated back to back, both in the returned string and in the dump file.
     *
     * @param str the search terms, unencoded
     * @return the response body with all line terminators removed
     * @throws Exception if the request fails or the dump file cannot be written
     */
    public String getHTML(String str) throws Exception
    {
        URL url = new URL(SEARCH_URL + URLEncoder.encode(str, QUERY_ENCODING));
        HttpURLConnection con = (HttpURLConnection) url.openConnection();
        con.setRequestProperty("User-Agent", USER_AGENT);

        StringBuilder page = new StringBuilder();
        try
        {
            Charset pageCharset = responseCharset(con.getContentType());
            // The reader is opened first so a failed request never creates or truncates the dump file.
            try (BufferedReader breader = new BufferedReader(
                         new InputStreamReader(con.getInputStream(), pageCharset));
                 Writer dump = openDump())
            {
                String line;
                while ((line = breader.readLine()) != null)
                {
                    page.append(line);
                    dump.write(line);
                }
            }
        }
        finally
        {
            // Release the connection even when reading or writing failed.
            con.disconnect();
        }
        return page.toString();
    }

    /**
     * Intended to extract data from a fetched result page; currently does nothing.
     *
     * @param str the HTML of a result page
     */
    public void parseHTML(String str)
    {
    }

    /** Opens the dump file as UTF-8, creating its parent directory when it is missing. */
    private static Writer openDump() throws java.io.IOException
    {
        File target = new File(DUMP_PATH);
        File parent = target.getParentFile();
        if (parent != null)
        {
            // Result deliberately unchecked: if the directory is still missing, opening the file below fails.
            parent.mkdirs();
        }
        return new OutputStreamWriter(new FileOutputStream(target), StandardCharsets.UTF_8);
    }

    /** Picks the charset named by a Content-Type header value, or gb2312 when none is usable. */
    private static Charset responseCharset(String contentType)
    {
        final String key = "charset=";
        if (contentType != null)
        {
            for (String part : contentType.split(";"))
            {
                String param = part.trim();
                if (!param.regionMatches(true, 0, key, 0, key.length()))
                {
                    continue;
                }
                String name = param.substring(key.length()).trim();
                if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\""))
                {
                    name = name.substring(1, name.length() - 1);
                }
                try
                {
                    return Charset.forName(name);
                }
                catch (IllegalArgumentException ignored)
                {
                    // Malformed or unsupported charset name: keep looking, then use the fallback.
                }
            }
        }
        // NOTE(review): assumes a response without a declared charset uses the request's encoding - confirm.
        return Charset.forName(QUERY_ENCODING);
    }
}