程序就是个好东西,人很难完成的任务,它只需很短时间就搞定。
下面我们来采集某房产网站上普陀区的所有小区列表。
该地址为:http://sh.fangjia.com/xiaoqu/--e-{0}|r-%E6%99%AE%E9%99%80%E5%8C%BA
{0}为页码,共35页,C#实现代码如下:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
namespace Hourse
{
    /// <summary>
    /// Console scraper: downloads every result page of a paged listing,
    /// extracts the name and address of each entry with a regular expression,
    /// and appends one "name address" line per entry to <c>data.txt</c> in the
    /// application's base directory.
    /// </summary>
    class Program
    {
        /// <summary>URL template of a listing page; the literal "{0}" is replaced by the page number.</summary>
        private static string uri;

        /// <summary>Full path of the output text file.</summary>
        private static string file;

        // One listing entry looks like this in the downloaded HTML:
        //
        //   <div class="fsize14 margin-bottom8">
        //     <strong>
        //       <a href="/xiaoqu-ID-NAME" target="_blank">
        //         NAME</a>
        //     </strong>
        //   </div>
        //   <div class="margin-bottom5">
        //     DISTRICT
        //     ADDRESS,</div>
        //
        // Group 1 captures the name, group 2 the raw (whitespace-laden) address text.
        private static readonly Regex EntryPattern = new Regex(
            "<div class=\"fsize14 margin-bottom8\">\\s+<strong>\\s+<a\\s+[^>]+>\\s+(.+?)</a>\\s+</strong>" +
            "\\s+</div>\\s+<div class=\"margin-bottom5\">([^<]+)</div>");

        // Characters stripped from the raw address: whitespace, commas and parentheses.
        private static readonly Regex AddressNoisePattern = new Regex("[\\s,( )]+");

        /// <summary>
        /// Entry point: scrapes pages 1..35, prints per-page progress and a final
        /// total, then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            uri = "http://sh.fangjia.com/xiaoqu/--e-{0}|r-%E6%99%AE%E9%99%80%E5%8C%BA";
            file = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "data.txt");

            // File.Create returns an open FileStream. It must be disposed right away,
            // otherwise the handle keeps the file locked and the first append in
            // Save() fails with an IOException.
            if (!File.Exists(file))
            {
                File.Create(file).Dispose();
            }

            Console.WriteLine("--------------------------");
            Console.WriteLine("开始采集数据,请等待...");
            Console.WriteLine("--------------------------");

            int pages = 35;
            int counts = 0;
            for (int i = 1; i <= pages; i++)
            {
                counts += OperateInfo(i);
            }

            Console.WriteLine("采集完成!共" + counts + "条,文件存放在" + file);
            Console.ReadKey();
        }

        /// <summary>
        /// Downloads one listing page, extracts every entry and appends it to the
        /// output file.
        /// </summary>
        /// <param name="page">Page number substituted into the URL template.</param>
        /// <returns>The number of entries matched on the page.</returns>
        static int OperateInfo(int page)
        {
            string _uri = uri.Replace("{0}", page.ToString());

            string txt;
            // WebClient is IDisposable; release it as soon as the page is downloaded.
            using (WebClient client = new WebClient())
            {
                byte[] datas = client.DownloadData(_uri);
                txt = Encoding.UTF8.GetString(datas);
            }

            MatchCollection mc = EntryPattern.Matches(txt);
            foreach (Match m in mc)
            {
                // Read the capture groups directly instead of re-running the
                // pattern over the matched text.
                string name = m.Groups[1].Value;
                string address = AddressNoisePattern.Replace(m.Groups[2].Value, "");
                Save(name + " " + address);
            }

            Console.WriteLine("第" + page + "页采集到" + mc.Count + "条!");
            return mc.Count;
        }

        /// <summary>
        /// Appends a single line to the output file using UTF-8.
        /// </summary>
        /// <param name="str">The line to write (without a trailing newline).</param>
        static void Save(string str)
        {
            // Disposing the writer flushes it, so no explicit Flush() is needed.
            using (StreamWriter sw = new StreamWriter(file, true, Encoding.UTF8))
            {
                sw.WriteLine(str);
            }
        }
    }
}
运行程序: