Java初学者实践：httpclient+HTMLParser 的应用_java html parser翻译网页-CSDN博客

使用httpclient调用google在线翻译以及爱词霸在线翻译，并使用HTMLParser对返回的结果进行处理，以此实现一个简单的在线翻译小程序，并利用多线程编程合并两个翻译工具。具体实现如下：

1.multithreadDict.java

 
    01package MultiTread;
 
    02import java.util.Scanner;
 
    03public class multithreadDict {
 
    04    public static void main(String[] args) throws InterruptedException {
 
    05        System.out.print("Input the word > ");
 
    06        Scanner s = new Scanner(System.in);
 
    07        String input= s.nextLine();
 
    08         
 
    09        GoogleDict google=new GoogleDict(input);
 
    10        google.start();
 
    11        //google.sleep(2000);
 
    12        IcibaDict iciba=new IcibaDict(input);
 
    13        iciba.start();
 
    14        //iciba.sleep(4000);
 
    15    }
 
    16}

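注：本来我想先显示google翻译的结果，后显示爱词霸翻译的结果，但即使使用sleep函数也很难保证这一点。原因在于 sleep 是静态方法：在 main 中写 google.sleep(2000)，休眠的其实是主线程，它只是推迟了第二个线程的启动，并不能保证第一个线程此时已经输出完毕；若要严格保证先后顺序，应在启动 iciba 之前调用 google.join() 等待 google 线程结束。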

Java线程调度是Java多线程的核心，只有良好的调度，才能充分发挥系统的性能，提高程序的执行效率。

这里要明确一点：不管程序员怎么编写调度，都只能最大限度地影响线程执行的次序，而不能做到精准控制。

线程休眠是使线程让出CPU的最简单的做法之一。线程休眠时，会将CPU资源交给其他线程，以便能轮换执行；休眠一定时间后，线程会苏醒，进入准备状态等待执行。

线程休眠的方法是Thread.sleep(long millis) 和Thread.sleep(long millis, int nanos) ，均为静态方法。简单说，哪个线程调用sleep，就休眠哪个线程。

2.GoogleDict.java

 
    01package MultiTread;
 
    02import java.io.File;
 
    03import java.io.FileWriter;
 
    04import java.net.URI;
 
    05import org.apache.http.HttpEntity;
 
    06import org.apache.http.HttpResponse;
 
    07import org.apache.http.client.HttpClient;
 
    08import org.apache.http.client.methods.HttpGet;
 
    09import org.apache.http.client.utils.URIUtils;
 
    10import org.apache.http.impl.client.DefaultHttpClient;
 
    11import org.apache.http.util.EntityUtils;
 
    12import org.htmlparser.Node;
 
    13import org.htmlparser.NodeFilter;
 
    14import org.htmlparser.Parser;
 
    15import org.htmlparser.filters.OrFilter;
 
    16import org.htmlparser.filters.TagNameFilter;
 
    17import org.htmlparser.util.NodeList;
 
    18import org.htmlparser.visitors.TextExtractingVisitor;
 
    19class GoogleDict extends Thread
 
    20{
 
    21    private String searchterm=null;
 
    22    public GoogleDict(String input)
 
    23    {
 
    24        this.searchterm=input;
 
    25    }
 
    26    public void run()
 
    27    {
 
    28        String text=null;
 
    29        //http://www.google.com/dictionary?source=translation&hl=zh-CN&q=computer&langpair=en|zh-CN
 
    30        try
 
    31        {
 
    32        HttpClient httpclient = new DefaultHttpClient();
 
    33        String searchstring = "source=translation&hl=zh-CN&q=" + searchterm +"&langpair=en%7Czh-CN";
 
    34        URI uri=URIUtils.createURI("http", "www.google.com", -1, "/dictionary", searchstring, null);
 
    35        HttpGet httpget = new HttpGet(uri);
 
    36        HttpResponse response = httpclient.execute(httpget);
 
    37        HttpEntity entity = response.getEntity();
 
    38         
 
    39        if (entity != null) {
 
    40            Parser parser = new Parser(EntityUtils.toString(entity));
 
    41            parser.setEncoding("gb2312");
 
    42            //NodeFilter filter_tab_content =new OrFilter( new  TagNameFilter("div"),new TagNameFilter("span"));
 
    43            NodeFilter filter_tab_content=new TagNameFilter("div");
 
    44            //NodeFilter filter_tab_content=new TagNameFilter("span");
 
    45            NodeList nodelist_tab_content = parser.parse(filter_tab_content);
 
    46            int length = nodelist_tab_content.size();
 
    47            if(searchterm.getBytes().length==searchterm.length())
 
    48            {
 
    49                for (int i = 10; i < length-3; i++) {
 
    50                    Node node_tab_content = nodelist_tab_content.elementAt(i);
 
    51                    Parser parser_tab_content = new Parser(node_tab_content
 
    52                        .toHtml());
 
    53                    TextExtractingVisitor visitor_tab_content = newTextExtractingVisitor();
 
    54                    parser_tab_content.visitAllNodesWith(visitor_tab_content);
 
    55                    text = text+"/n"+visitor_tab_content.getExtractedText().trim();
 
    56                }
 
    57            }
 
    58            else
 
    59            {
 
    60                for (int i = 8; i < length-3; i++) {
 
    61                    Node node_tab_content = nodelist_tab_content.elementAt(i);
 
    62                    Parser parser_tab_content = new Parser(node_tab_content
 
    63                        .toHtml());
 
    64                    TextExtractingVisitor visitor_tab_content = newTextExtractingVisitor();
 
    65                    parser_tab_content.visitAllNodesWith(visitor_tab_content);
 
    66                    text = text+"/n"+visitor_tab_content.getExtractedText().trim();
 
    67                }
 
    68            }
 
    69            text=text.replaceAll("相关搜索", "相关搜索:");
 
    70            text=text.replaceAll("null", "");
 
    71            text=text.replaceAll("/n/n", "/n");
 
    72            text=text.replaceAll("/n/n", "/n");
 
    73            text=text.replaceAll("/n/n", "/n");
 
    74             
 
    75            System.out.println("-----------------------------------------" +
 
    76                    "谷歌翻译-------------------------------------------");
 
    77            System.out.println(uri);
 
    78             
 
    79            System.out.println(text);
 
    80            File f = new File("D://study/Java/GoogleDict/" + searchterm + ".txt");
 
    81            FileWriter fw = new FileWriter(f);
 
    82            fw.write(text);
 
    83            fw.flush();
 
    84            fw.close();
 
    85            }
 
    86        }
 
    87        catch(Exception e)
 
    88        {
 
    89            e.printStackTrace();
 
    90        }
 
    91    }
 
    92}

注：在使用HTMLParser处理google翻译返回的结果时，由于同时存在<span>...</span>,<div>...</div>,<span><div>...</div></span>三种标签，导致处理比较困难，个人对HTMLParser库也不是很熟悉，所以最终所得结果并不是很满意。

3.IcibaDict.java

 
    01package MultiTread;
 
    02import java.io.File;
 
    03import java.io.FileWriter;
 
    04import org.apache.http.HttpEntity;
 
    05import org.apache.http.HttpResponse;
 
    06import org.apache.http.client.HttpClient;
 
    07import org.apache.http.client.methods.HttpGet;
 
    08import org.apache.http.impl.client.DefaultHttpClient;
 
    09import org.apache.http.util.EntityUtils;
 
    10import org.htmlparser.Node;
 
    11import org.htmlparser.NodeFilter;
 
    12import org.htmlparser.Parser;
 
    13import org.htmlparser.filters.AndFilter;
 
    14import org.htmlparser.filters.HasAttributeFilter;
 
    15import org.htmlparser.filters.TagNameFilter;
 
    16import org.htmlparser.util.NodeList;
 
    17import org.htmlparser.visitors.TextExtractingVisitor;
 
    18class IcibaDict extends Thread
 
    19{
 
    20    private String searchterm=null;
 
    21    public IcibaDict(String input)
 
    22    {
 
    23        this.searchterm=input;
 
    24    }
 
    25    public void run()
 
    26    {
 
    27        String text=null,webContent=null;
 
    28        try
 
    29        {
 
    30            HttpClient httpclient = new DefaultHttpClient();
 
    31            String searchstring = "http://www.iciba.com/" + searchterm + "/";
 
    32            HttpGet httpget = new HttpGet(searchstring);
 
    33            HttpResponse response = httpclient.execute(httpget);
 
    34            HttpEntity entity = response.getEntity();
 
    35            if (entity != null) {
 
    36                String content=EntityUtils.toString(entity);
 
    37                content=content.replaceAll("<a href", "  <a href");
 
    38                Parser parser = new Parser(content);
 
    39                parser.setEncoding("gb2312");
 
    40                NodeFilter filter_tab_content = new AndFilter(new TagNameFilter(
 
    41                        "div"), new HasAttributeFilter("class", "tab_content"));
 
    42                NodeList nodelist_tab_content = parser.parse(filter_tab_content);
 
    43                int length = nodelist_tab_content.size();
 
    44                for (int i = 0; i < length; i++) {
 
    45                    Node node_tab_content = nodelist_tab_content.elementAt(i);
 
    46                    Parser parser_tab_content = new Parser(node_tab_content
 
    47                        .toHtml());
 
    48                    TextExtractingVisitor visitor_tab_content = newTextExtractingVisitor();
 
    49                    parser_tab_content.visitAllNodesWith(visitor_tab_content);
 
    50                    text = text+"/n"+visitor_tab_content.getExtractedText().trim();
 
    51                }
 
    52                parser.reset();
 
    53                NodeFilter filter_web = new AndFilter(new TagNameFilter(
 
    54                        "div"), new HasAttributeFilter("class", "content_block"));
 
    55                NodeList nodelist_web = parser.parse(filter_web);
 
    56                Node node_web = nodelist_web.elementAt(0);
 
    57                if(node_web!=null)
 
    58                {
 
    59                    Parser parser_web = new Parser(node_web.toHtml());
 
    60                    TextExtractingVisitor visitor_web = new TextExtractingVisitor();
 
    61                    parser_web.visitAllNodesWith(visitor_web);
 
    62                    webContent=visitor_web.getExtractedText().trim();
 
    63                }
 
    64                text=text+webContent;
 
    65                text=text.replaceAll("                      ", "");
 
    66                text=text.replaceAll("              ", "");
 
    67                text=text.replaceAll("      ", "/n");
 
    68                text=text.replaceAll("/n/n/n", "/n");
 
    69                text=text.replaceAll("/n/n", "/n");
 
    70                text=text.replaceAll("/n/n", "/n");
 
    71                text=text.replaceAll("  ", "");
 
    72                text=text.replace("null", "");
 
    73                text=text.replace("相关搜索", "");
 
    74                text=text.replace("句库","");
 
    75                text=text.replace("韦氏词典","");
 
    76                text=text.replace("Dictionary", "");
 
    77                 
 
    78                System.out.println("*************************************" +
 
    79                        "爱词霸翻译*************************************");
 
    80                System.out.println(searchstring);
 
    81                 
 
    82                System.out.println(text);
 
    83             
 
    84                File f = new File("D://study/Java/IcibaDict/" + searchterm + ".txt");
 
    85                FileWriter fw = new FileWriter(f);
 
    86                fw.write(text);
 
    87                fw.flush();
 
    88                fw.close();
 
    89            }
 
    90        }
 
    91        catch(Exception e)
 
    92        {
 
    93            e.printStackTrace();
 
    94        }
 
    95    }
 
    96}