闲来无聊,爬了一下全国省、市、区、乡镇、居委会的信息,并存入了数据库。
以后做地址联动选择的时候可能用得着,这次可以精确到居委会
数据来源:国家统计局 2016年统计用区划代码和城乡划分代码(截止2016年07月31日)
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html
具体代码如下,写得比较随意:
using AngleSharp.Parser.Html;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
namespace CrawlerArea
{
class Program
{
/// <summary>
/// Entry point. The active stage reads every stored area whose Code has at
/// least 9 characters, builds the page URL for it and hands it to
/// <see cref="getCommitteeInfo"/>. Earlier stages are kept below as
/// commented-out code; they appear to have been run one at a time by
/// uncommenting the relevant section.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Console.WriteLine(DateTime.Now);

    // Province level
    //f("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html");

    // City level
    //AreaDBEntities areaDBEntities = new AreaDBEntities();
    //var data = areaDBEntities.AreaInfoes.ToList();
    //foreach (var item in data)
    //{
    // string url = string.Format("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/{0}.html", item.Code);
    // getCityInfo(url, item.Code);
    // System.Threading.Thread.Sleep(50);
    //}
    //Console.WriteLine(DateTime.Now);

    // District / county level
    //AreaDBEntities areaDBEntities = new AreaDBEntities();
    //var data = areaDBEntities.AreaInfoes.ToList();
    //foreach (var item in data)
    //{
    // string url = string.Format("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/{0}/{1}.html", item.PedarId, item.Code);
    // getCountyInfo(url, item.PedarId, item.Code);
    // System.Threading.Thread.Sleep(50);
    //}
    //Console.WriteLine(DateTime.Now);

    // Street level
    //AreaDBEntities areaDBEntities = new AreaDBEntities();
    //var data = areaDBEntities.AreaInfoes.Where(t => t.PedarId >= 1000).ToList();
    //foreach (var item in data)
    //{
    // string temp = item.Code.ToString();
    // string url = string.Format("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/{0}/{1}/{2}.html", temp.Substring(0, 2), temp.Substring(2, 2), item.Code);
    // getStreetInfo(url, item.Code);
    // System.Threading.Thread.Sleep(50);
    //}
    //Console.WriteLine(DateTime.Now);

    // Village / neighborhood committee level (the active stage)
    // NOTE(review): assumes AreaDBEntities is disposable (e.g. an Entity Framework context) - confirm.
    using (AreaDBEntities areaDBEntities = new AreaDBEntities())
    {
        // The query is materialized up front; getCommitteeInfo adds rows to the
        // same AreaInfoes set while this loop is running.
        // NOTE(review): ">= 9" also matches any longer codes already stored, so a
        // re-run would request pages for those too - confirm whether "== 9" is intended.
        var data = areaDBEntities.AreaInfoes.Where(t => t.Code.Length >= 9).ToList();
        foreach (var item in data)
        {
            string temp = item.Code;
            // Path segments are the first three 2-character slices of the code,
            // followed by the full code as the page name.
            string url = string.Format("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/{0}/{1}/{2}/{3}.html", temp.Substring(0, 2), temp.Substring(2, 2), temp.Substring(4, 2), item.Code);
            getCommitteeInfo(url, item.Code);
            Console.WriteLine(item.Code + "----" + item.Name);
            // Pause between requests.
            System.Threading.Thread.Sleep(200);
        }
    }

    Console.WriteLine(DateTime.Now);
    Console.WriteLine("OK");
    Console.ReadKey();
}
/// <summary>
/// Downloads one page, reads every element with the CSS class "villagetr"
/// and stores each one as an <c>AreaInfo</c> row whose <c>PedarId</c> is the
/// given parent code. A page that could not be downloaded is reported on the
/// console and skipped.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="code">Code of the parent area; it is parsed as an integer and stored in <c>PedarId</c>.</param>
private static void getCommitteeInfo(string url, string code)
{
    var htmlString = HttpGet(url);
    // HttpGet returns null when the request fails; there is nothing to parse then.
    if (string.IsNullOrEmpty(htmlString))
    {
        Console.WriteLine("No content, skipped: " + url);
        return;
    }

    HtmlParser htmlParser = new HtmlParser();
    var rows = htmlParser.Parse(htmlString)
        .QuerySelectorAll(".villagetr")
        .ToList();
    if (rows.Count == 0)
    {
        return;
    }

    int parentId = int.Parse(code);

    // NOTE(review): assumes AreaDBEntities is disposable (e.g. an Entity Framework context) - confirm.
    using (AreaDBEntities areaDBEntities = new AreaDBEntities())
    {
        foreach (var row in rows)
        {
            var cells = row.Children.ToList();
            // Cell 0 holds the code and cell 2 the name; skip rows that are too short.
            if (cells.Count < 3)
            {
                continue;
            }

            AreaInfo areaInfo = new AreaInfo();
            // TextContent yields the cell's text rather than its raw markup.
            areaInfo.Code = cells[0].TextContent.Trim();
            areaInfo.Name = cells[2].TextContent.Trim();
            areaInfo.PedarId = parentId;
            areaDBEntities.AreaInfoes.Add(areaInfo);
        }

        // One save per page.
        areaDBEntities.SaveChanges();
    }
}
/// <summary>
/// 街道
/// </summary>
/// <param name="url"></param>
/// <param name="code"></param>
//private static void getStreetInfo(string url, int? code)
//{
// var htmlString = HttpGet(url);
// HtmlParser htmlParser = new HtmlParser();
// var data = htmlParser.Parse(htmlString)
// .QuerySelectorAll(".towntr")
// .Select(t => t)
// .ToList();
// List<Node> list = new List<Node>();
// foreach (var item in data)
// {
// var area = htmlParser.Parse(item.InnerHtml)
// .QuerySelectorAll("a")
// .Select(t => t).ToList();
// foreach (var td in area)
// {
// Node node = new Node();
// node.code = td.GetAttribute("href");
// node.area = td.TextContent;
// list.Add(node);
// }
// }
// AreaDBEntities areaDBEntities = new AreaDBEntities();
// int k = 0;
// foreach (var item in list)
// {
// if (k % 2 != 0)
// {
// AreaInfo areaInfo = new AreaInfo();
// Console.WriteLine(item.code + "----" + item.area);
// string code1 = item.code.Substring(item.code.IndexOf("/") + 1, 9);
// areaInfo.Code = int.Parse(code1);
// areaInfo.Name = item.area;
// areaInfo.PedarId = code;
// areaDBEntities.AreaInfoes.Add(areaInfo);
// }
// k++;
// }
// areaDBEntities.SaveChanges();
// Console.WriteLine();
//}
//private static void getCountyInfo(string url, int? PedarId, int? code)
//{
// if (PedarId == null) return;
// var htmlString = HttpGet(url);
// HtmlParser htmlParser = new HtmlParser();
// var data = htmlParser.Parse(htmlString)
// .QuerySelectorAll(".countytr")
// .Select(t => t)
// .ToList();
// List<Node> list = new List<Node>();
// foreach (var item in data)
// {
// var area = htmlParser.Parse(item.InnerHtml)
// .QuerySelectorAll("a")
// .Select(t => t).ToList();
// foreach (var td in area)
// {
// Node node = new Node();
// node.code = td.GetAttribute("href");
// node.area = td.TextContent;
// list.Add(node);
// }
// }
// AreaDBEntities areaDBEntities = new AreaDBEntities();
// int k = 0;
// foreach (var item in list)
// {
// if (k % 2 != 0)
// {
// AreaInfo areaInfo = new AreaInfo();
// Console.WriteLine(item.code + "----" + item.area);
// string code1 = item.code.Substring(item.code.IndexOf("/") + 1, 6);
// areaInfo.Code = int.Parse(code1);
// areaInfo.Name = item.area;
// areaInfo.PedarId = code;
// areaDBEntities.AreaInfoes.Add(areaInfo);
// }
// k++;
// }
// areaDBEntities.SaveChanges();
// Console.WriteLine();
//}
//private static void getCityInfo(string url, int? PedarId)
//{
// var htmlString = HttpGet(url);
// HtmlParser htmlParser = new HtmlParser();
// var data = htmlParser.Parse(htmlString)
// .QuerySelectorAll(".citytr")
// .Select(t => t)
// .ToList();
// List<Node> list = new List<Node>();
// foreach (var item in data)
// {
// var area = htmlParser.Parse(item.InnerHtml)
// .QuerySelectorAll("a")
// .Select(t => t).ToList();
// foreach (var td in area)
// {
// Node node = new Node();
// node.code = td.GetAttribute("href");
// node.area = td.TextContent;
// list.Add(node);
// }
// }
// AreaDBEntities areaDBEntities = new AreaDBEntities();
// int k = 0;
// foreach (var item in list)
// {
// if (k % 2 != 0)
// {
// AreaInfo areaInfo = new AreaInfo();
// Console.WriteLine(item.code + "----" + item.area);
// string code = item.code.Substring(item.code.IndexOf("/") + 1, 4);
// areaInfo.Code = int.Parse(code);
// areaInfo.Name = item.area;
// areaInfo.PedarId = PedarId;
// areaDBEntities.AreaInfoes.Add(areaInfo);
// }
// k++;
// }
// areaDBEntities.SaveChanges();
// Console.WriteLine();
//}
/// <summary>
/// Sends an HTTP GET request and returns the response body as text.
/// </summary>
/// <param name="url">Absolute URL to request.</param>
/// <returns>
/// The response body decoded with <see cref="Encoding.Default"/>, or
/// <c>null</c> when the request fails for any reason (the failure is
/// written to standard error).
/// </returns>
public static string HttpGet(string url)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Method = "GET";
        request.Accept = "text/html, application/xhtml+xml, */*";

        // Dispose the response and its stream even when reading throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        // NOTE(review): Encoding.Default depends on the runtime/OS configuration;
        // confirm it matches the encoding of the pages being downloaded.
        using (StreamReader reader = new StreamReader(body, Encoding.Default))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Callers treat null as "no content"; keep that contract but leave a trace.
        Console.Error.WriteLine("HttpGet failed for " + url + ": " + ex.Message);
        return null;
    }
}
//得到省的信息
//static void f(string url)
//{
// var htmlString = HttpGet(url);
// HtmlParser htmlParser = new HtmlParser();
// var data = htmlParser.Parse(htmlString)
// .QuerySelectorAll(".provincetr")
// .Select(t => t)
// .ToList();
// List<Node> list = new List<Node>();
// foreach (var item in data)
// {
// var area = htmlParser.Parse(item.InnerHtml)
// .QuerySelectorAll("a")
// .Select(t => t).ToList();
// foreach (var td in area)
// {
// Node node = new Node();
// node.code = td.GetAttribute("href");
// node.area = td.TextContent;
// list.Add(node);
// }
// }
// AreaDBEntities areaDBEntities = new AreaDBEntities();
// foreach (var item in list)
// {
// AreaInfo areaInfo = new AreaInfo();
// Console.WriteLine(item.code + "----" + item.area);
// areaInfo.Code = int.Parse(item.code.Substring(0, item.code.IndexOf(".")));
// areaInfo.Name = item.area;
// areaInfo.PedarId = null;
// areaDBEntities.AreaInfoes.Add(areaInfo);
// }
// areaDBEntities.SaveChanges();
// Console.WriteLine();
// //}
//}
}
/// <summary>
/// Simple holder for a pair of strings: a code value and an area name.
/// </summary>
internal class Node
{
    /// <summary>Code value of the entry.</summary>
    public string code { get; set; }

    /// <summary>Name of the area the entry describes.</summary>
    public string area { get; set; }
}
/// <summary>
/// Holder for a single string value. No active code visible in this file
/// refers to this type.
/// </summary>
internal class td
{
    /// <summary>The stored string value.</summary>
    public string td1 { get; set; }
}
}
稍后会将生成的数据库脚本分享出来,需要的可以私聊我。