C#小程序实现从百度摘取搜索结果

最新推荐文章于 2021-03-02 18:08:49 发布

BearKeeper

最新推荐文章于 2021-03-02 18:08:49 发布

阅读量621

点赞数

文章标签：百度 c# string html regex input

本文链接：https://blog.csdn.net/BearKeeper/article/details/6032851

版权

百度不使用xhtml，这样使得.NET原有的XML功能就不是那么好用了。

（而且，谁会真正喜欢DOM呢？用起来多累人啊！）

不过百度的页面很不规则，所以迫不得已使用了大量的硬编码。

因此，这个程序对百度的页面设计做了相当多的假设，无法很好地适应百度页面结构在未来的改变。

还好这种小程序写起来轻松，所以没事改一改也没事。

另外这个程序使用了大量的正则表达式，这可能会使得它在效率上不适合于用来整合各个搜索引擎的结果。

如果需要在一个页面同时展示几个搜索引擎的结果，我建议使用iframe标签，或者呢，就是让后台把网页通过ajax发给前台，然后在前台用js产生页面。

特别注意，程序中使用了FCL中好用的url编码的功能，因此必须额外添加对System.Web这个程序集的引用。

代码——百度机器人

  
  
   
   
   
     1 
   
   using
   
    System;

   
     2 
   
    
   
   using
   
    System.Collections.Generic;

   
     3 
   
    
   
   using
   
    System.Text;

   
     4 
   
    
   
   using
   
    System.Text.RegularExpressions;

   
     5 
   
    
   
   using
   
    System.Web;

   
     6 
   
    
   
   using
   
    System.Net;

   
     7 
   
   using
   
    System.IO;

   
     8 
   
   namespace
   
    baiduRobotStrim

   
     9 
   
   {

   
    10 
   
       
   
   /// <summary>
   /// One search hit scraped from a Baidu result page.
   /// A field stays null when the matching piece of markup was not found.
   /// </summary>
   struct BaiduEntry
   {
       /// <summary>Text of the result's title.</summary>
       public string title;

       /// <summary>Summary snippet that accompanies the title.</summary>
       public string brief;

       /// <summary>Value of the href attribute of the result's anchor.</summary>
       public string link;
   }

   
    14 
   
       
   
   class
   
    Program

   
    15 
   
       {

   
    16 
   
           
   
   /// <summary>
   /// Downloads the Baidu result page for <paramref name="keyword"/> and returns its HTML.
   /// </summary>
   /// <param name="keyword">Search keyword; it is URL-encoded with code page 936 before being sent.</param>
   /// <returns>
   /// The page text decoded as code page 936. If the connection fails, the text read
   /// before the failure is returned (an empty string when nothing was read).
   /// </returns>
   static string GetHtml(string keyword)
   {
       string url = @"http://www.baidu.com/";

       // Baidu takes the query string in code page 936 rather than UTF-8.
       // (Author's note: Google accepts both UTF-8 and code-page encoded queries,
       // although its own pages declare UTF-8 in the HTTP headers.)
       Encoding gbk = Encoding.GetEncoding(936);
       string encodedKeyword = HttpUtility.UrlEncode(keyword, gbk);
       string address = url + "s?wd=" + encodedKeyword;

       StringBuilder sb = new StringBuilder();
       try
       {
           Console.WriteLine("正在读取网页{0}的内容……", address);

           // The request is issued inside the try block so that a failed connection
           // is reported below instead of escaping as an unhandled exception.
           HttpWebRequest req = (HttpWebRequest)WebRequest.Create(address);
           using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
           using (Stream stream = response.GetResponseStream())
           using (StreamReader reader = new StreamReader(stream, gbk, false))
           {
               // StreamReader keeps decoder state between reads, so a double-byte
               // character split across two network reads is still decoded correctly.
               char[] buf = new char[8192];
               int count;
               while ((count = reader.Read(buf, 0, buf.Length)) > 0)
               {
                   sb.Append(buf, 0, count);
               }
           }
       }
       catch (WebException)
       {
           Console.WriteLine("网络连接失败，请检查网络设置。");
       }
       catch (IOException)
       {
           Console.WriteLine("网络连接失败，请检查网络设置。");
       }

       return sb.ToString();
   }

   
    55 
   
           
   
   /// <summary>
   /// Writes every collected search hit to the console, numbering them from 1
   /// and printing a separator line after each one.
   /// </summary>
   /// <param name="entries">Hits to print; fields that are null are skipped.</param>
   static void PrintResult(List<BaiduEntry> entries)
   {
       int index = 0;
       foreach (BaiduEntry hit in entries)
       {
           index++;
           Console.WriteLine("找到了百度的第{0}条搜索结果：", index);

           if (hit.link != null)
           {
               Console.WriteLine("找到了一条链接：");
               Console.WriteLine(hit.link);
           }

           if (hit.title != null)
           {
               Console.WriteLine("标题为：");
               Console.WriteLine(hit.title);
           }

           if (hit.brief != null)
           {
               Console.WriteLine("下面是摘要：");
               Console.WriteLine(hit.brief);
           }

           Cut();
       }
   }

   
    79 
   
           
   
   /// <summary>
   /// Manual smoke check: strips tags from a fixed HTML sample and prints the result.
   /// </summary>
   static void simpleOutput()
   {
       string stripped = RemoveSomeTags("<table><tr><td><font>test</font><a>hello</a><br></td></tr></table>");
       Console.WriteLine(stripped);
   }

   
    84 
   
           
   
   /// <summary>
   /// Removes line-break tags from an HTML fragment.
   /// </summary>
   /// <param name="html">HTML fragment; null or empty input is returned unchanged.</param>
   /// <returns>The fragment without any &lt;br&gt; tags.</returns>
   static string RemoveVoidTag(string html)
   {
       if (string.IsNullOrEmpty(html))
       {
           return html;
       }

       // Match <br>, <br/> and <br /> in any letter case; the other patterns in this
       // program already treat tag names case-insensitively.
       return Regex.Replace(html, @"<br\s*/?>", "", RegexOptions.IgnoreCase);
   }

   
    93 
   
           
   
   /// <summary>
   /// Unwraps inline formatting tags (a, em, b, font): the opening and closing tags
   /// are deleted while the text between them is kept.
   /// </summary>
   /// <param name="html">HTML fragment; null or empty input is returned unchanged.</param>
   /// <returns>The fragment with those tags removed.</returns>
   static string ReleaseXmlTags(string html)
   {
       if (string.IsNullOrEmpty(html))
       {
           return html;
       }

       // The lookahead requires the tag name to end right after a/em/b/font, so that
       // tags such as <abbr>, <br> or <body> are left alone; [^>]* then swallows any
       // attributes up to the closing bracket.
       return Regex.Replace(html, @"</?(?:a|em|b|font)(?=[\s/>])[^>]*>", "", RegexOptions.IgnoreCase);
   }

   
   102 
   
   

   
   103 
   
           
   
   /// <summary>
   /// Strips markup from a fragment by running RemoveVoidTag first and
   /// ReleaseXmlTags on its result.
   /// </summary>
   /// <param name="html">HTML fragment to clean.</param>
   /// <returns>The fragment after both passes.</returns>
   static string RemoveSomeTags(string html)
   {
       return ReleaseXmlTags(RemoveVoidTag(html));
   }

   
   109 
   
           
   
   /// <summary>
   /// Prints a separator line of tildes to the console.
   /// </summary>
   static void Cut()
   {
       const string separator = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~";
       Console.Out.WriteLine(separator);
   }

   
   113 
   
           
   
   /// <summary>
   /// Parses and prints the search results with HTML tags stripped from the summaries.
   /// </summary>
   /// <param name="input">HTML that contains the result tables.</param>
   static void MainProc(string input)
   {
       const bool keepTagsInBrief = false;
       MainProc(input, keepTagsInBrief);
   }

   
   117 
   
           
   
   /// <summary>
   /// Extracts link, title and summary from every result table in
   /// <paramref name="input"/> and prints the collected entries.
   /// </summary>
   /// <param name="input">
   /// HTML that ideally holds nothing but the sibling table elements of the result
   /// list, i.e. a pre-trimmed page. Null is treated as empty.
   /// </param>
   /// <param name="tagsForBrief">
   /// True to keep the HTML tags inside each summary; false to strip them.
   /// </param>
   static void MainProc(string input, bool tagsForBrief)
   {
       if (input == null)
       {
           input = "";
       }

       // All patterns are built once here instead of once per table.
       Regex tablePattern = new Regex("<table.*?</table>", RegexOptions.IgnoreCase);
       Regex cellPattern = new Regex("<td.*?>(.*)</td>", RegexOptions.IgnoreCase);
       // Group 1 is the quoted href value of the first anchor.
       Regex linkPattern = new Regex("<a.*?href=\"(.*?)\".*?>", RegexOptions.IgnoreCase);
       // Greedy on purpose: spans from the first <font to the last </font>.
       Regex fontSpanPattern = new Regex("<font.*</font>");
       Regex fontItemPattern = new Regex("<font.*?>(.*?)</font>");

       List<BaiduEntry> entries = new List<BaiduEntry>();

       // Author's note: on the result page several table elements are siblings; the
       // first one is an advertisement and the remaining ones are the search results.
       for (Match table = tablePattern.Match(input); table.Success; table = table.NextMatch())
       {
           Match cell = cellPattern.Match(table.Value);
           if (!cell.Success)
           {
               continue;
           }

           // Inner HTML of the td element; this is where the result itself lives.
           string html = cell.Groups[1].Value;
           BaiduEntry baidu = new BaiduEntry();

           Match link = linkPattern.Match(html);
           if (link.Success)
           {
               baidu.link = link.Groups[1].Value;
           }

           // The td holds font tags nested two levels deep; keep only that outer span.
           html = fontSpanPattern.Match(html).Value;

           Match contentMatch = fontItemPattern.Match(html);
           if (contentMatch.Success)
           {
               // First font element: the title.
               baidu.title = RemoveSomeTags(contentMatch.Groups[1].Value);

               // Second font element, when present: the summary.
               contentMatch = contentMatch.NextMatch();
               if (contentMatch.Success)
               {
                   string brief = contentMatch.Groups[1].Value;

                   // Cut the summary off where a further font element begins.
                   int splitIndex = brief.IndexOf("<font", StringComparison.Ordinal);
                   if (splitIndex > -1)
                   {
                       brief = brief.Substring(0, splitIndex);
                   }

                   // Strip the tags unless the caller asked for an HTML summary.
                   if (!tagsForBrief)
                   {
                       brief = RemoveSomeTags(brief);
                   }

                   baidu.brief = brief;
               }
           }
           else
           {
               // A table without any font span is skipped without being recorded.
               if (html == "")
               {
                   continue;
               }

               Console.WriteLine("怪了，这里没有找到任何结果。");
               Console.WriteLine("如果百度已经更改了页面的结构那么程序需要重新设计。");
               Console.WriteLine("Mark:");
               Console.WriteLine(html);
               Cut();
               Cut();
               Cut();
           }

           entries.Add(baidu);
       }

       PrintResult(entries);
   }
  
  
  
  
   
   /// <summary>
   /// Entry point: reads a keyword from the console, downloads the Baidu result page,
   /// trims it to the result tables and prints the parsed results.
   /// </summary>
   /// <param name="args">Command-line arguments; not used.</param>
   public static void Main(string[] args)
   {
       Console.WriteLine("请输入一个关键字。");
       string keyword = Console.ReadLine();

       Console.WriteLine("正在从百度上获取结果，请稍等……");
       string input = GetHtml(keyword);

       // Keep everything from the first table whose class is "result" up to the last
       // closing table tag; [\s\S] lets the match run across line breaks.
       Regex r = new Regex("<table.*class=\"result\"[\\s\\S]*</table>", RegexOptions.IgnoreCase);
       input = r.Match(input).Value;

       MainProc(input);

       // Wait for a key press so the console window stays open.
       Console.ReadKey(true);
   }
203 }
204 }
205

BearKeeper

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
C#小程序实现从百度摘取搜索结果

百度不使用xhtml，这样使得.NET原有的XML功能就不是那么好用了。（而且，谁会真正喜欢DOM呢？用起来多累人啊！）不过百度的页面很不规则，所以迫不得已使用了大量的硬编码。因此，这个程序对百度的页面设计做了相当多的假设，无法很好的适应百度的页面结构在未来的改变。还好这种小程序写起来轻松，所以没事改一改也没事。另外这个程序使用了大量的正则表达式，这可能会使得它在效率上不适合于用来整合各个搜索引擎的结果。如果需要在一个页面同时展示几个搜索引擎的结果，我建议使用iframe标签，或者呢，就是让后台把网页通过a
复制链接

扫一扫