利用c#从网上爬取成语的解释
一年前写的代码,今天整理文件夹时偶然发现。代码写得很糟糕,原本打算删掉,但又想到当时两眼昏沉地熬夜编代码,心中生出了一丝不舍,今天把它放到这里,就当是留个纪念吧!
代码的功能是从本地的txt成语文档中提取成语,利用C#的WebRequest从网页中爬取成语的解释,并分别存入txt文档和Access数据库中。
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Data.OleDb;
using System.IO;
using System.Net;
namespace 成语查询v1._0
{
public partial class Form1 : Form
{
//Boolean accessFlag = false;// flags whether results should also be stored in the database
//int radioFlag =1;// =1: radioButton1 is selected, =2: radioButton2 is selected
// Line count of the idiom source file; assigned from getColNumber in button2_Click
// and used as the denominator of the progress bar percentage.
int num = 0;
string path="";// path of the idiom source file, chosen through the open-file dialog
string dbName;// database file name (textBox3 text plus ".mdb")
// Writer for the output .txt file; created in button1_Click.
StreamWriter streamWriter;
// Reader over the idiom source file; created in button2_Click.
StreamReader fileReader;
// OLE DB connection string for the database file named by dbName.
string accessPath;
// Connection to the database; created and opened in button1_Click.
OleDbConnection connect;
/// <summary>
/// Creates the form. All setup is delegated to InitializeComponent, which is
/// defined outside this file (presumably the designer half of this partial class).
/// </summary>
public Form1()
{
    this.InitializeComponent();
}
/// <summary>
/// Prepares the two local outputs: the results text file named in textBox2 and
/// the Access database named in textBox3. A missing database file is created
/// together with its "data" table.
/// </summary>
/// <param name="sender">The control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // Validate both names before touching any resource, so that an early
    // return can never leave a freshly opened file behind.
    if (string.IsNullOrEmpty(textBox2.Text))
    {
        MessageBox.Show("请输入txt文档的名字!");
        return;
    }
    if (string.IsNullOrEmpty(textBox3.Text))
    {
        MessageBox.Show("请输入数据库的名字!");
        return;
    }

    // A previous click may have left a writer/connection open; release them
    // before the fields are overwritten.
    if (streamWriter != null)
    {
        streamWriter.Dispose();
        streamWriter = null;
    }
    if (connect != null)
    {
        connect.Dispose();
        connect = null;
    }

    dbName = textBox3.Text + ".mdb";
    accessPath = @"Provider=Microsoft.Jet.OLEDB.4.0;Data Source=" + dbName;
    try
    {
        bool isNewDatabase = !File.Exists(dbName);
        if (isNewDatabase)
        {
            // The catalog is only needed to create the .mdb file itself.
            ADOX.Catalog catalog = new ADOX.Catalog();
            catalog.Create(accessPath + ";Jet OLEDB:Engine Type=5");
        }

        connect = new OleDbConnection(accessPath);
        connect.Open();

        if (isNewDatabase)
        {
            string command = "create table data (id int primary key ,name char(255) ,content longchar,number1 char(25),number2 char(25))";
            using (OleDbCommand dbCommand = new OleDbCommand(command, connect))
            {
                dbCommand.ExecuteNonQuery();
            }
        }

        // Append instead of File.OpenWrite: OpenWrite starts at offset 0 without
        // truncating, so a resumed run would overwrite the earlier results and
        // could leave a stale tail in the file.
        streamWriter = new StreamWriter(textBox2.Text + ".txt", true, Encoding.UTF8);
    }
    catch (Exception ee)
    {
        // Do not keep half-initialised state around, and do not report success.
        if (connect != null)
        {
            connect.Dispose();
            connect = null;
        }
        MessageBox.Show(ee.ToString());
        return;
    }

    MessageBox.Show("连接本地成功!");
}
/// <summary>
/// Runs the batch: reads the idiom source file line by line, looks up each
/// idiom's explanation and search-result counts, and records them in the
/// results text file and the database. On failure the index of the current
/// line is saved to data.txt so a later run can resume from it.
/// </summary>
/// <param name="sender">The control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button2_Click(object sender, EventArgs e)
{
    if (string.IsNullOrEmpty(path))
    {
        MessageBox.Show("请打开成语的源!");
        return;
    }
    // Without an open writer and connection every write below would fail with
    // a NullReferenceException / InvalidOperationException.
    if (streamWriter == null || connect == null || connect.State != ConnectionState.Open)
    {
        MessageBox.Show("请先连接本地的txt文档和数据库!");
        return;
    }

    int flag = readSavedProgress();
    int i = 0;
    bool finished = false;
    num = getColNumber(path);
    try
    {
        fileReader = new StreamReader(path, Encoding.UTF8);
        string name;
        while ((name = fileReader.ReadLine()) != null)
        {
            // ">=" so that a fresh run (flag == 0) includes the first line and a
            // resumed run retries the line that failed last time.
            if (i >= flag)
            {
                processIdiom(i, name);
            }
            i++;
            progressBar1.Value = i * 100 / num;
        }
        finished = true;
    }
    catch (Exception)
    {
        // Never move the saved position backwards if the failure happened
        // before the previously saved line was reached again.
        saveProgress(Math.Max(i, flag));
        MessageBox.Show("可能是网络出现问题,已经保存进度请求退出");
    }
    finally
    {
        if (fileReader != null)
        {
            fileReader.Close();
            fileReader = null;
        }
        connect.Close();
        // Closing flushes the buffered results; without it the tail of the
        // text file is lost on the success path.
        streamWriter.Close();
        streamWriter = null;
    }

    if (finished)
    {
        // A completed run must not leave a stale resume point behind.
        if (File.Exists("data.txt"))
        {
            File.Delete("data.txt");
        }
        MessageBox.Show("查询完成!");
    }
}

/// <summary>Reads the resume index from data.txt; returns 0 when the file is absent or unusable.</summary>
private int readSavedProgress()
{
    if (!File.Exists("data.txt"))
    {
        return 0;
    }
    using (StreamReader data = new StreamReader("data.txt", Encoding.UTF8))
    {
        int saved;
        if (int.TryParse(data.ReadLine(), out saved) && saved > 0)
        {
            return saved;
        }
        return 0;
    }
}

/// <summary>Writes the index of the line to resume from into data.txt, replacing any previous content.</summary>
private void saveProgress(int index)
{
    File.WriteAllText("data.txt", index.ToString() + Environment.NewLine);
}

/// <summary>Looks up one idiom, shows the results in the form, and stores them in the text file and the database.</summary>
private void processIdiom(int id, string name)
{
    string content;  // explanation of the idiom
    string result1;  // number of Baidu search results
    string result2;  // number of Sogou search results

    textBox1.Text = name;
    if (radioButton1.Checked)
    {
        serchFromWeb1(name, out content);
    }
    else
    {
        serchFromWeb2(name, out content);
    }
    richTextBox1.Text = content;

    serchNumber(name, @"http://www.baidu.com/s?wd=", "class=\"nums_text\">百度为您找到相关结果约", "个</span>", out result1);
    textBox4.Text = result1;
    serchNumber(name, @"https://www.sogou.com/web?query=", "搜狗已为您找到约", "条相关结果", out result2);
    textBox5.Text = result2;

    // Plain WriteLine(string): passing a second argument selects the
    // composite-format overload, which throws on '{' or '}' in scraped text.
    streamWriter.WriteLine(name + " " + content + " " + result1 + " " + result2);

    // Parameterized insert: scraped text routinely contains quotes that would
    // break (or inject into) a concatenated SQL string. OLE DB parameters are
    // positional, so they are added in the order of the '?' placeholders.
    using (OleDbCommand insert = new OleDbCommand("insert into data(id,name,content,number1,number2) values(?,?,?,?,?)", connect))
    {
        insert.Parameters.Add("@id", OleDbType.Integer).Value = id;
        insert.Parameters.Add("@name", OleDbType.VarWChar, 255).Value = name;
        insert.Parameters.Add("@content", OleDbType.LongVarWChar).Value = content ?? "";
        insert.Parameters.Add("@number1", OleDbType.VarWChar, 25).Value = result1 ?? "";
        insert.Parameters.Add("@number2", OleDbType.VarWChar, 25).Value = result2 ?? "";
        insert.ExecuteNonQuery();
    }
}
/// <summary>
/// Fetches the Baidu Baike page for an idiom and extracts its explanation
/// from the page's description meta tag (the text between the tag's opening
/// and the first "..." that follows it).
/// </summary>
/// <param name="name">The idiom to look up; appended to the Baike item URL.</param>
/// <param name="content">Receives the explanation, or an empty string when none was found.</param>
/// <remarks>Network failures are not caught here; they propagate to the caller.</remarks>
public void serchFromWeb1(string name, out string content)
{
    const string former1 = "<meta name=\"description\" content=\"";
    const string former2 = "...";
    const int headLength = 1000;  // the explanation sits within the first 1000 characters of the page
    const int searchStart = 50;   // offset at which the marker search begins
    content = "";

    string realUrl = @"https://baike.baidu.com/item/" + Uri.EscapeDataString(name);
    string html;
    WebRequest webRequest = WebRequest.Create(realUrl);
    // Dispose the response chain: undisposed responses keep their connection
    // checked out, and a batch run soon blocks on the per-host connection limit.
    using (WebResponse responds = webRequest.GetResponse())
    using (Stream stream = responds.GetResponseStream())
    using (StreamReader webReader = new StreamReader(stream, Encoding.UTF8))
    {
        // Read at most headLength characters; a shorter page simply yields
        // fewer characters instead of an out-of-range Substring.
        char[] head = new char[headLength];
        int read = webReader.ReadBlock(head, 0, head.Length);
        html = new string(head, 0, read);
    }

    int indexS = html.Length > searchStart
        ? html.IndexOf(former1, searchStart, StringComparison.Ordinal)
        : -1;
    int contentStart = indexS + former1.Length;
    // Look for the terminator only after the opening marker, so an earlier
    // "..." in the page head cannot be mistaken for the end of the text.
    int indexE = indexS >= 0
        ? html.IndexOf(former2, contentStart, StringComparison.Ordinal)
        : -1;

    if (indexS < 0 || indexE <= contentStart)
    {
        MessageBox.Show("没有在百度百科上找到["+name+"]这个词的解释~~");
        return;
    }
    content = html.Substring(contentStart, indexE - contentStart);
}
/// <summary>
/// Posts an idiom to the www.51bc.net search page and extracts its
/// explanation from a fixed window of the returned HTML.
/// </summary>
/// <param name="name">The idiom to look up; sent as the f_key form field.</param>
/// <param name="content">Receives the explanation, or an empty string when none was found.</param>
/// <remarks>
/// Best effort: any failure, including network errors, is reported to the user
/// as "not found" and is not propagated.
/// </remarks>
public void serchFromWeb2(string name, out string content)
{
    const string format1 = "</u></a></td>";
    const string format2 = "</tr>";
    const int windowStart = 4460;   // offset of the result area in the returned page
    const int windowLength = 1000;
    const int leadingSkip = 22;     // markup between format1 and the explanation text
    const int trailingSkip = 9;     // markup between the explanation text and format2

    string found = null;
    try
    {
        string url = @"http://www.51bc.net/cy/serach.php";
        string postString = "f_type=chengyu&f_type2=&f_key=" + name;
        // NOTE(review): Encoding.Default depends on the machine's ANSI code page;
        // this presumably matches the site's encoding on a Chinese Windows install.
        byte[] posDate = Encoding.Default.GetBytes(postString);
        string resultDate;
        using (WebClient webCilent = new WebClient())
        {
            webCilent.Headers.Add("Content-Type", "application/x-www-form-urlencoded");
            resultDate = Encoding.Default.GetString(webCilent.UploadData(url, posDate));
        }

        if (resultDate.Length > windowStart)
        {
            string html = resultDate.Substring(windowStart, Math.Min(windowLength, resultDate.Length - windowStart));
            // Test the raw IndexOf result: adding the skip first would turn a
            // "not found" -1 into a positive index and hide the miss.
            int markerIndex = html.IndexOf(format1, StringComparison.Ordinal);
            if (markerIndex >= 0 && markerIndex + leadingSkip <= html.Length)
            {
                int rowEnd = html.IndexOf(format2, markerIndex + leadingSkip, StringComparison.Ordinal);
                int textStart = markerIndex + leadingSkip + format1.Length;
                int textEnd = rowEnd - trailingSkip;
                if (rowEnd >= 0 && textEnd > textStart)
                {
                    found = html.Substring(textStart, textEnd - textStart);
                }
            }
        }
    }
    catch (Exception)
    {
        found = null;
    }

    if (found == null)
    {
        MessageBox.Show("没有在51上找到[" + name + "]这个词的解释~~");
        content = "";
    }
    else
    {
        content = found;
    }
}
/// <summary>
/// Counts the lines in a text file.
/// </summary>
/// <param name="thePath">Path of the file whose lines are counted.</param>
/// <returns>The number of lines in the file; 0 for an empty file.</returns>
public int getColNumber(string thePath)
{
    int lineCount = 0;
    // Read from the parameter rather than a field, and dispose the reader
    // even if reading throws.
    using (StreamReader theReader = new StreamReader(thePath))
    {
        while (theReader.ReadLine() != null)
        {
            lineCount++;
        }
    }
    return lineCount;
}
/// <summary>
/// Queries a search engine for a word and extracts the result count from the
/// returned page: the text between <paramref name="format1"/> and the next
/// <paramref name="format2"/>.
/// </summary>
/// <param name="word">The search term; URL-escaped and appended to <paramref name="url"/>.</param>
/// <param name="url">Search URL ending with the query parameter, e.g. "...?wd=".</param>
/// <param name="format1">Marker text that immediately precedes the count.</param>
/// <param name="format2">Marker text that immediately follows the count.</param>
/// <param name="result">Receives the count text, or an empty string when the markers are not found.</param>
/// <remarks>Network failures are not caught here; they propagate to the caller.</remarks>
void serchNumber(string word, string url, string format1, string format2, out string result)
{
    const int skippedPrefix = 5000;  // arbitrary offset: the start of the page is not searched
    result = "";

    string realUrl = url + Uri.EscapeDataString(word);
    string html;
    WebRequest request = WebRequest.Create(realUrl);
    // Dispose the response chain so the connection is released between calls.
    using (WebResponse respons = request.GetResponse())
    using (Stream stream = respons.GetResponseStream())
    using (StreamReader streamReader = new StreamReader(stream, Encoding.UTF8))
    {
        html = streamReader.ReadToEnd();
    }

    // A short page or a missing marker means "no count available", not an error.
    if (html.Length <= skippedPrefix)
    {
        return;
    }
    int indexS = html.IndexOf(format1, skippedPrefix, StringComparison.Ordinal);
    if (indexS < 0)
    {
        return;
    }
    int valueStart = indexS + format1.Length;
    int indexE = html.IndexOf(format2, valueStart, StringComparison.Ordinal);
    if (indexE < 0)
    {
        return;
    }
    result = html.Substring(valueStart, indexE - valueStart);
}
/// <summary>
/// Shows the open-file dialog and, when the user confirms, stores the chosen
/// file name as the idiom source path.
/// </summary>
private void button3_Click(object sender, EventArgs e)
{
    DialogResult choice = openFileDialog1.ShowDialog();
    if (choice != DialogResult.OK)
    {
        return;
    }
    path = openFileDialog1.FileName;
}
/// <summary>
/// Unchecks radioButton2 whenever radioButton1 becomes checked.
/// </summary>
private void radioButton1_CheckedChanged(object sender, EventArgs e)
{
    if (!radioButton1.Checked)
    {
        return;
    }
    radioButton2.Checked = false;
}
/// <summary>
/// Unchecks radioButton1 whenever radioButton2 becomes checked.
/// </summary>
private void radioButton2_CheckedChanged(object sender, EventArgs e)
{
    if (!radioButton2.Checked)
    {
        return;
    }
    radioButton1.Checked = false;
}
}
}