找工作神器的主要原理是,根据查询条件去访问相应的网站,通过程序拿到相应网站的HTML代码,再通过相应的正则表达式取相应的信息,再去截取相应的重要信息,再将相应的信息显示在表格里。这里使用了线程池异步的方式,同时去三个网站抓取数据,并且每抓取并解析一条数据,就会立即在表格里显示出来,这样就避免等待太长时间还看不到结果的尴尬,程序发布后各位园友可以下载程序看看效果如何,还请您能提出宝贵的意见。
整个程序显示的界面效果图:
图片上面显示的是查询条件,输入查询条件后点击查询,下面显示的是查询出的数据,分别有三个页签(猎聘网、智联招聘和前程无忧),表格分别显示职位名称、公司名称、公司性质、公司规模、月薪/年薪、工作地点、工作经验、最低学历和发布时间等等信息,日后根据需要还可以继续扩充想要看到的信息,实现看到的信息基本上跟网站上的信息差不多。
查询条件
解析:现在输入的条件有工作地点、薪水范围(上限、下限),关键词、必须包含的关键词,现在暂时只支持以上几种条件,日后可能会继续加入更多的查询条件(公司名称、公司性质、工作经验、学历要求等等条件,日后再扩展),使查询更方便。
启动查询的代码如下:
通过启动线程池异步的方式同时启动三个网站数据的加载,这样可以增强用户体验,并且会拿到一条数据解析一条数据,并且及时显示在表格里,这样用户不需要等待太长的时间而看不到结果。
程序启动首先会加载城市对应的ID的一个字典,数据加载如下:

1.前程无忧
前程无忧我相信应该是很多园友找工作的首选,博主就是在这上面注册了简历,并且每次换工作都是在这上面取得了成功,感觉还挺不错,祝愿各位园友都能找到自己称心如意的工作,只要我们大家都一起努力应该都没有问题的。
下面介绍实现逻辑:
#region * 51Job (前程无忧)
/// <summary>
/// Entry point invoked via the thread pool to start a 51Job search.
/// Reads the search criteria from the form, resets the 51Job grid, resolves the
/// city name to its ID through <c>dic2</c>, and starts a background thread that
/// runs <c>IJob.GetJobInfoList</c>.
/// </summary>
/// <param name="obj">State object passed by the thread pool; not used.</param>
private void Get51JobData(object obj)
{
    string workAddress = string.Empty;  // work location as typed by the user
    string keyWord = string.Empty;      // search keyword
    string upperSalary = string.Empty;  // salary range, first box
    string lowerSalary = string.Empty;  // salary range, second box
    string mustKey = string.Empty;      // text the posting must contain (optional)

    // This method does not run on the UI thread, so all control reads and the
    // reset of the grid's bound data are marshalled to the UI thread in one call.
    this.Invoke((MethodInvoker)delegate
    {
        workAddress = this.txtAddress.Text.Trim();
        keyWord = this.txtKeyWord.Text.Trim();
        upperSalary = this.txtSalary1.Text.Trim();
        lowerSalary = this.txtSalary2.Text.Trim();
        if (this.chkMustKey.Checked)
        {
            mustKey = this.txtMustKey.Text.Trim();
        }

        jobInfoList2.Clear();
        curJobInfo2 = null;
        dt2.Rows.Clear();
        this.gcJob2.DataSource = dt2;
    });

    // Resolve the typed city to its ID; the default KeyValuePair (null key) means "not found".
    KeyValuePair<string, string> kv = dic2.FirstOrDefault(t => t.Value.Contains(workAddress));
    if (kv.Key == null)
    {
        this.Invoke((MethodInvoker)delegate
        {
            XtraMessageBox.Show("无法搜索该工作地点", "警告", MessageBoxButtons.OK, MessageBoxIcon.Warning);
        });
        return;
    }
    string workAddressId = kv.Key;

    // Build the site-specific scraper and run it on its own background thread.
    JobFactory tws = new JobFactory("51Job", workAddress, workAddressId, keyWord, upperSalary, lowerSalary, mustKey);
    IJob job = tws.GetJob();
    if (job != null)
    {
        // Unsubscribe first so the handler is never attached twice to the same instance.
        job.GetJobEnd -= new GetJobEndEventHandler(job_GetJob2End);
        job.GetJobEnd += new GetJobEndEventHandler(job_GetJob2End);

        Thread th = new Thread(new ThreadStart(job.GetJobInfoList));
        th.IsBackground = true;
        th.Start();
    }
}

/// <summary>
/// Handles one result from the scraper. A non-null <paramref name="e"/> is stored
/// and appended to the grid as a new row; a null <paramref name="e"/> re-enables
/// <c>layoutControlGroup2</c>.
/// </summary>
/// <param name="o">Event sender supplied by the scraper; not used.</param>
/// <param name="e">The parsed posting, or null.</param>
private void job_GetJob2End(object o, JobInfo e)
{
    // Raised from the scraper thread, so the grid update is marshalled to the UI thread.
    this.Invoke((MethodInvoker)delegate
    {
        if (e == null)
        {
            this.layoutControlGroup2.Enabled = true;
            return;
        }

        jobInfoList2.Add(e);
        curJobInfo2 = e;   // read by gvJob2_InitNewRow while the new row is initialised
        this.gvJob2.AddNewRow();
    });
}

/// <summary>
/// Fills a freshly added grid row from <c>curJobInfo2</c> and moves to the last row.
/// </summary>
/// <param name="sender">The grid view raising the event.</param>
/// <param name="e">Carries the handle of the row being initialised.</param>
private void gvJob2_InitNewRow(object sender, DevExpress.XtraGrid.Views.Grid.InitNewRowEventArgs e)
{
    try
    {
        DataRowView dr = this.gvJob2.GetRow(e.RowHandle) as DataRowView;
        dr["Url"] = curJobInfo2.Url;                // posting link
        dr["Position"] = curJobInfo2.Position;      // job title
        dr["Company"] = curJobInfo2.Company;        // company name
        dr["Nature"] = curJobInfo2.Nature;          // company type
        dr["Scale"] = curJobInfo2.Scale;            // company size
        dr["Salary"] = curJobInfo2.Salary;          // monthly / yearly salary
        dr["Address"] = curJobInfo2.Address;        // work location
        dr["Experience"] = curJobInfo2.Experience;  // required experience
        dr["Education"] = curJobInfo2.Education;    // minimum education
        dr["Time"] = curJobInfo2.Time;              // publication date

        this.gvJob2.UpdateCurrentRow();
        this.gvJob2.RefreshData();
        this.gvJob2.MoveLast();
    }
    catch (Exception ex)
    {
        // Keep the user-facing message, but do not lose the cause.
        System.Diagnostics.Debug.WriteLine(ex);
        XtraMessageBox.Show("添加行失败");
    }
}

/// <summary>
/// Opens the link of the focused row when the grid is double-clicked.
/// </summary>
/// <param name="sender">The grid control raising the event.</param>
/// <param name="e">Not used.</param>
private void gcJob2_DoubleClick(object sender, EventArgs e)
{
    var row = this.gvJob2.GetFocusedDataRow();
    if (row == null)
    {
        return; // double-click on an empty grid or outside any row
    }

    // The URL was scraped from a web page, so only hand absolute http/https
    // addresses to the shell.
    string uri = row["Url"].ToString();
    Uri parsed;
    if (!Uri.TryCreate(uri, UriKind.Absolute, out parsed))
    {
        return;
    }
    if (parsed.Scheme != Uri.UriSchemeHttp && parsed.Scheme != Uri.UriSchemeHttps)
    {
        return;
    }

    System.Diagnostics.Process.Start(uri);
}
#endregion
以上四个函数分别是线程池启动调用的方法、表格增加一行数据、表格增加行和双击行打开当前行链接,实现这四个方法即可获取前程无忧的数据,那么获取HTML内容和解析HTML需用另外一个类实现,实现这个类如下:
/// <summary>
/// Scrapes job postings from 51Job: downloads the search result pages, follows
/// the link of every listed posting that mentions the requested location, parses
/// the detail page and raises <see cref="GetJobEnd"/> once per parsed posting.
/// </summary>
public class JobFrom51Job : IJob
{
    #region * Private fields
    // Character set passed to every download and used to encode the keyword.
    private const string PageEncoding = "gb2312";

    // Divisor used to turn the reported job count into a page count.
    private const int JobsPerPage = 30;

    // Base address of the search page; the query string is appended per search.
    private readonly string url = @"http://search.51job.com/jobsearch/search_result.php?";

    /// <summary>
    /// Work location (display name).
    /// </summary>
    private readonly string workAddress;
    /// <summary>
    /// Work location ID, sent as the <c>jobarea</c> parameter.
    /// </summary>
    private readonly string workAddressId;
    /// <summary>
    /// Search keyword.
    /// </summary>
    private readonly string keyWord;
    /// <summary>
    /// Text a posting must contain to be reported; empty means no filter.
    /// </summary>
    private readonly string mustKey;
    #endregion

    /// <summary>
    /// Creates a scraper for one search.
    /// </summary>
    /// <param name="workAddress">Work location name; listing rows not containing it are skipped.</param>
    /// <param name="workAddressId">Location ID sent as <c>jobarea</c>.</param>
    /// <param name="keyWord">Search keyword; may be empty.</param>
    /// <param name="mustKey">Text the detail page must contain; may be empty.</param>
    public JobFrom51Job(string workAddress, string workAddressId, string keyWord, string mustKey)
    {
        this.workAddress = workAddress;
        this.workAddressId = workAddressId;
        this.keyWord = keyWord;
        this.mustKey = mustKey;
    }

    /// <summary>
    /// Raised with (page HTML, posting) for every parsed posting, and once with
    /// (null, null) after all pages have been processed.
    /// </summary>
    public event GetJobEndEventHandler GetJobEnd;

    /// <summary>
    /// Runs the search: fetches the first result page, derives the page count
    /// from the reported job count, then fetches the remaining pages.
    /// Exceptions from downloading or parsing propagate unchanged, keeping their
    /// original type and stack trace.
    /// </summary>
    public void GetJobInfoList()
    {
        // Build the query in locals so that calling this method twice does not
        // append to the URL again or double-encode the keyword.
        StringBuilder condition = new StringBuilder();
        condition.Append("jobarea=" + workAddressId);
        if (!string.IsNullOrEmpty(keyWord))
        {
            string encodedKeyWord = System.Web.HttpUtility.UrlEncode(keyWord, Encoding.GetEncoding(PageEncoding));
            condition.Append("&keyword=" + encodedKeyWord);
        }
        condition.Append("&keywordtype=2");

        string searchUrl = url + condition.ToString();
        string html = GetHtmlCode.GetByget(searchUrl, PageEncoding);
        GetJobInfoFromPage(html);

        int pageCount = GetPageCount(html);
        for (int i = 2; i <= pageCount; i++)
        {
            string pageUrl = searchUrl + string.Format("&curr_page={0}", i);
            GetJobInfoFromPage(GetHtmlCode.GetByget(pageUrl, PageEncoding));
        }

        // Completion signal.
        RaiseGetJobEnd(null, null);
    }

    /// <summary>
    /// Reads the job count from the hidden <c>jobid_count</c> field and converts it
    /// to a page count (rounded up). Returns 0 when the field is missing or unreadable.
    /// </summary>
    private static int GetPageCount(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return 0;
        }

        string countText = Regex.Match(html, "(?<=name=\"jobid_count\"\\s*?value=\")\\d+(?=\">)").Value;
        int jobCount;
        if (!int.TryParse(countText, out jobCount))
        {
            return 0;
        }
        return (jobCount + JobsPerPage - 1) / JobsPerPage;
    }

    /// <summary>
    /// Processes one search result page: for every listing row that mentions the
    /// requested location, follows the posting link and parses the detail page.
    /// </summary>
    private void GetJobInfoFromPage(string pageStr)
    {
        if (string.IsNullOrEmpty(pageStr))
        {
            return; // download failed or returned nothing
        }

        // All whitespace is stripped first, which is why the patterns below
        // contain no spaces (e.g. "<trclass=").
        pageStr = Regex.Replace(pageStr, "\\s", "");

        MatchCollection rows = Regex.Matches(pageStr, "(?<=<trclass=\"tr0\").+?(?=</tr>)");
        foreach (Match row in rows)
        {
            if (!row.Value.Contains(workAddress))
            {
                continue;
            }

            string jobUrl = Regex.Match(row.Value, "(?<=<aadid=\"\"href=\").+?(?=\")").Value;
            if (!string.IsNullOrEmpty(jobUrl))
            {
                GetJobInfoFromUrl(jobUrl);
            }
        }
    }

    // Regex filters applied to the page body: { pattern, replacement }.
    private static readonly string[][] Filters =
    {
        new[] { @"(?is)<script.*?>.*?</script>", "" },
        new[] { @"(?is)<style.*?>.*?</style>", "" },
        new[] { @"(?is)<!--.*?-->", "" }, // strip HTML comments
        new[] { @"(?is)<footer.*?>.*?</footer>",""},
        new[] { "(?is) <div style=\"width:470px; padding-left:5px;\">.*?</div>",""},
        new[] { "(?is)<div id=\"top\">.*?</iframe> </div></div>",""},
        new[] { "(?is)<div class=\"grayline\" id=\"announcementbody\">.*?</li></ul> </div>",""}
    };

    /// <summary>
    /// Downloads one posting, filters it by <c>mustKey</c>, extracts its fields
    /// and raises <see cref="GetJobEnd"/> with the result.
    /// </summary>
    /// <param name="url">Address of the posting's detail page.</param>
    private void GetJobInfoFromUrl(string url)
    {
        string pageStr = GetHtmlCode.GetByget(url, PageEncoding);
        if (string.IsNullOrEmpty(pageStr))
        {
            return;
        }

        pageStr = pageStr.Replace("\r\n", ""); // drop line breaks

        // Keep only the <body> element.
        string body = string.Empty;
        Match bodyMatch = Regex.Match(pageStr, @"(?is)<body.*?</body>");
        if (bodyMatch.Success)
        {
            body = bodyMatch.Value.Replace("<tr >", "<tr>");
        }

        // Remove scripts, styles and other unrelated markup.
        foreach (var filter in Filters)
        {
            body = Regex.Replace(body, filter[0], filter[1]);
        }

        if (!string.IsNullOrEmpty(mustKey) && !body.Contains(mustKey))
        {
            return; // posting does not contain the required text
        }

        // From here on the body has no whitespace; patterns are written accordingly.
        body = Regex.Replace(body, "\\s", "");

        JobInfo info = new JobInfo();
        info.Url = url;

        // Job title: header cell spanning 2 columns, or 3 on the alternative layout.
        string position = Regex.Match(body, "<tdclass=\"sr_bt\"colspan=\"2\">(.*?)</td>").Value;
        if (string.IsNullOrEmpty(position))
        {
            position = Regex.Match(body, "<tdclass=\"sr_bt\"colspan=\"3\">(.*?)</td>").Value;
        }
        info.Position = Between(position, ">", "</", false);

        // Company name: text of the first link whose href ends in "html".
        string company = Regex.Match(body, ".html\">(.*?)</a>").Value;
        info.Company = Between(company, ">", "</a>", false);

        info.Address = LabelledCell(body, "工作地点:");
        info.Nature = FirstLabelledStrong(body, "<br><br><strong>", "<br>", "公司性质:", "公司行业:");
        info.Scale = FirstLabelledStrong(body, "</td>", "</td>", "公司规模:");
        info.Experience = LabelledCell(body, "工作年限:");
        info.Education = LabelledCell(body, "学(?:&nbsp;)*历:");
        info.Salary = LabelledCell(body, "薪水范围:");
        info.Time = LabelledCell(body, "发布日期:");

        RaiseGetJobEnd(pageStr, info);
    }

    /// <summary>
    /// Returns the content of the <c>txt_2</c> table cell that follows the cell
    /// holding <paramref name="labelPattern"/>, or an empty string when absent.
    /// </summary>
    private static string LabelledCell(string body, string labelPattern)
    {
        string cell = Regex.Match(body, labelPattern + "</td><tdclass=\"txt_2\">(.*?)</td>").Value;
        return Between(cell, "\">", "</td>", true);
    }

    /// <summary>
    /// Returns the text following the first of <paramref name="labels"/> that is
    /// found as "label</strong>...", cut at the first <paramref name="cutToken"/>.
    /// </summary>
    /// <remarks>
    /// The original patterns had literal blanks after "</strong>" and cut the value
    /// at a fixed offset of 26. Blanks can never match here because the body is
    /// whitespace-stripped. NOTE(review): 26 equals the label length plus two
    /// "&amp;nbsp;" entities, which suggests the blanks were originally
    /// "&amp;nbsp;&amp;nbsp;" - confirm against a live page. The pattern below
    /// accepts any number of "&amp;nbsp;" and uses a capture group instead of an offset.
    /// </remarks>
    private static string FirstLabelledStrong(string body, string terminator, string cutToken, params string[] labels)
    {
        foreach (string label in labels)
        {
            Match match = Regex.Match(body, label + "</strong>(?:&nbsp;)*(.*?)" + terminator);
            if (!match.Success)
            {
                continue;
            }

            string captured = match.Groups[1].Value;
            int cut = captured.IndexOf(cutToken, StringComparison.Ordinal);
            return cut >= 0 ? captured.Substring(0, cut) : captured;
        }
        return string.Empty;
    }

    /// <summary>
    /// Returns the part of <paramref name="value"/> between the first
    /// <paramref name="startToken"/> and the first (or last, when
    /// <paramref name="useLastEnd"/> is true) <paramref name="endToken"/>.
    /// Returns an empty string when the value is empty or the markers are
    /// missing or out of order, instead of throwing.
    /// </summary>
    private static string Between(string value, string startToken, string endToken, bool useLastEnd)
    {
        if (string.IsNullOrEmpty(value))
        {
            return string.Empty;
        }

        int start = value.IndexOf(startToken, StringComparison.Ordinal);
        if (start < 0)
        {
            return string.Empty;
        }
        start += startToken.Length;

        int end = useLastEnd
            ? value.LastIndexOf(endToken, StringComparison.Ordinal)
            : value.IndexOf(endToken, StringComparison.Ordinal);
        return end >= start ? value.Substring(start, end - start) : string.Empty;
    }

    /// <summary>
    /// Raises <see cref="GetJobEnd"/> if anyone is subscribed.
    /// </summary>
    private void RaiseGetJobEnd(object sender, JobInfo info)
    {
        // Copy to a local so a concurrent unsubscribe cannot null the delegate
        // between the check and the call.
        GetJobEndEventHandler handler = GetJobEnd;
        if (handler != null)
        {
            handler(sender, info);
        }
    }
}
以上这个类的作用是分别根据网址获取HTML内容,再根据正则表达式获取招聘相关信息,再通过函数截取相关字段的信息,再组装到前台界面,实现数据的显示,这个里面有一个逻辑就是动态获取每一条招聘信息的链接,再根据链接去获取HTML信息,相当于这中间有两层解析HTML的过程。
2.智联招聘
智联招聘是我自己每次找工作的备选项,每次把前程无忧上的所有招聘信息全部看完后,就会在智联招聘上浏览下,感觉还挺不错的,不知各位园友有没有试下,不过会有很多与前程无忧是重复的招聘信息,所以还得靠自己去区分。
下面介绍实现逻辑:
/// <summary>
/// Scraper for Zhaopin (智联招聘). Downloads every search result page and raises
/// <see cref="GetJobEnd"/> once per page. Field extraction is not implemented
/// yet, so the reported <c>JobInfo</c> objects are empty.
/// </summary>
public class JobFromZhiLian : IJob
{
    #region Private fields
    // Character set passed to every download.
    private const string PageEncoding = "utf-8";

    // Base address of the search page; the query string is appended per search.
    private readonly string url = @"http://sou.zhaopin.com/Jobs/SearchResult.ashx?";

    /// <summary>
    /// Work location, sent URL-encoded as the <c>jl</c> parameter.
    /// </summary>
    private readonly string workAddress;
    /// <summary>
    /// Search keyword, sent URL-encoded as the <c>kw</c> parameter.
    /// </summary>
    private readonly string keyWord;
    /// <summary>
    /// Salary bound sent as the <c>sf</c> parameter.
    /// NOTE(review): "sf"/"st" look like "salary from"/"salary to", which would make
    /// this the lower bound despite its name - confirm against the site and the caller.
    /// </summary>
    private readonly string upperSalary;
    /// <summary>
    /// Salary bound sent as the <c>st</c> parameter (see the note on <c>upperSalary</c>).
    /// </summary>
    private readonly string lowerSalary;
    /// <summary>
    /// Text a posting must contain. Stored but currently not applied, because the
    /// per-posting parsing is not implemented.
    /// </summary>
    private readonly string mustKey;
    #endregion

    /// <summary>
    /// Creates a scraper for one search.
    /// </summary>
    /// <param name="workAddress">Work location name.</param>
    /// <param name="keyWord">Search keyword; may be empty.</param>
    /// <param name="upperSalary">Value for the <c>sf</c> parameter; may be empty.</param>
    /// <param name="lowerSalary">Value for the <c>st</c> parameter; may be empty.</param>
    /// <param name="mustKey">Required text; currently unused.</param>
    public JobFromZhiLian(string workAddress, string keyWord, string upperSalary, string lowerSalary, string mustKey)
    {
        this.workAddress = workAddress;
        this.keyWord = keyWord;
        this.upperSalary = upperSalary;
        this.lowerSalary = lowerSalary;
        this.mustKey = mustKey;
    }

    /// <summary>
    /// Raised with (page HTML, posting) per processed page, and once with
    /// (null, null) after all pages have been processed.
    /// </summary>
    public event GetJobEndEventHandler GetJobEnd;

    /// <summary>
    /// Runs the search: fetches the first result page, reads the page count from
    /// the pager, then fetches the remaining pages. Exceptions propagate
    /// unchanged, keeping their original type and stack trace.
    /// </summary>
    public void GetJobInfoList()
    {
        // Encode into locals so that calling this method twice does not
        // double-encode the fields or append to the URL again.
        StringBuilder condition = new StringBuilder();
        condition.Append("jl=" + HttpUtility.UrlEncode(workAddress, Encoding.UTF8));
        if (!string.IsNullOrEmpty(keyWord))
        {
            condition.Append("&kw=" + HttpUtility.UrlEncode(keyWord, Encoding.UTF8));
        }
        condition.Append("&sm=1");
        if (!string.IsNullOrEmpty(upperSalary))
        {
            condition.Append("&sf=" + upperSalary);
        }
        if (!string.IsNullOrEmpty(lowerSalary))
        {
            condition.Append("&st=" + lowerSalary);
        }

        string searchUrl = url + condition.ToString();
        string html = GetHtmlCode.GetByget(searchUrl, PageEncoding);
        GetJobInfoFromPage(html);

        int pageCount = GetPageCount(html);
        for (int i = 2; i <= pageCount; i++)
        {
            string pageUrl = searchUrl + string.Format("&p={0}", i);
            GetJobInfoFromPage(GetHtmlCode.GetByget(pageUrl, PageEncoding));
        }

        // Completion signal.
        RaiseGetJobEnd(null, null);
    }

    /// <summary>
    /// Reads the total page count from the pager's <c>onkeypress</c> handler.
    /// Returns 0 when the page is empty or the pager is not found.
    /// </summary>
    private static int GetPageCount(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return 0;
        }

        // The attribute name must start with a Latin "o". The pasted original began
        // with a Greek omicron, which never matches real markup and left the page
        // count at 0.
        string countText = Regex.Match(html, "(?<=onkeypress=\"zlapply.searchjob.enter2Page\\(this,event,)\\d+").Value;
        int pageCount;
        int.TryParse(countText, out pageCount);
        return pageCount;
    }

    // Regex filters applied to the page body: { pattern, replacement }.
    private static readonly string[][] Filters =
    {
        new[] { @"(?is)<script.*?>.*?</script>", "" },
        new[] { @"(?is)<style.*?>.*?</style>", "" },
        new[] { @"(?is)<!--.*?-->", "" } // strip HTML comments
    };

    /// <summary>
    /// Processes one search result page. The result-list markup is isolated but
    /// not parsed yet, so an empty <c>JobInfo</c> is reported for the page.
    /// </summary>
    private void GetJobInfoFromPage(string pageStr)
    {
        if (string.IsNullOrEmpty(pageStr))
        {
            return;
        }

        string resultList = ExtractResultList(pageStr);

        // TODO: parse the individual postings out of resultList and report one
        // JobInfo per posting, applying mustKey. An earlier, disabled attempt matched
        // "<tableclass=\"search-result-tab\">" blocks and read the fields with
        // look-behind patterns such as "(?<=地点:)[-/\\w]+(?=</span>)".
        // NOTE(review): until then every page reports one empty JobInfo - confirm
        // this placeholder is still wanted.
        JobInfo info = new JobInfo();
        RaiseGetJobEnd(pageStr, info);
    }

    /// <summary>
    /// Reduces a result page to the whitespace-free markup of the result list:
    /// takes the body, applies <see cref="Filters"/>, strips all whitespace and
    /// keeps the <c>newlist_list_content</c> block when it is present.
    /// </summary>
    private static string ExtractResultList(string pageStr)
    {
        pageStr = pageStr.Replace("\r\n", ""); // drop line breaks

        string body = string.Empty;
        Match bodyMatch = Regex.Match(pageStr, @"(?is)<body.*?</body>");
        if (bodyMatch.Success)
        {
            body = bodyMatch.Value.Replace("<tr >", "<tr>");
        }

        foreach (var filter in Filters)
        {
            body = Regex.Replace(body, filter[0], filter[1]);
        }

        body = Regex.Replace(body, "\\s", "");

        Match listMatch = Regex.Match(body, "(?is)<divclass=\"newlist_list_content\"id=\"newlist_list_content_table\">.*?</dd></dl></div></div></div>");
        return listMatch.Success ? listMatch.Value : body;
    }

    /// <summary>
    /// Raises <see cref="GetJobEnd"/> if anyone is subscribed.
    /// </summary>
    private void RaiseGetJobEnd(object sender, JobInfo info)
    {
        // Copy to a local so a concurrent unsubscribe cannot null the delegate
        // between the check and the call.
        GetJobEndEventHandler handler = GetJobEnd;
        if (handler != null)
        {
            handler(sender, info);
        }
    }
}
以上为智联招聘解析HTML相关类,以上逻辑中正则表达式还在完善中,还未完全实现成功,正则表达式还有问题。
3.猎聘网
猎聘网也是最近一两年才兴起的,这个网站上基本上都是很多猎头发布的信息,开的工资大多都是十多二十万年薪的岗位,只要你具备这个实力可以去这个网站看看,应该会有所收获的,不过这个网站也有部分企业自己发布的招聘信息,如果前面两个网站都没有看到自己满意的求职信息,那么这个网站也可以是自己求职的一个补充,不知各位博友是不是支持我这种观点。
下面介绍实现逻辑:
/// <summary>
/// Scrapes job postings from Liepin (猎聘网): downloads the search result pages,
/// follows every titled link that mentions the requested location, parses the
/// detail page and raises <see cref="GetJobEnd"/> once per parsed posting.
/// </summary>
public class JobFromLiePin : IJob
{
    #region * Private fields
    // Character set passed to every download.
    private const string PageEncoding = "utf-8";

    // Upper bound on the number of result pages requested per search.
    // NOTE(review): paging stops early only when a download comes back empty;
    // a cheaper stop condition would need knowledge of the site's pager.
    private const int MaxPages = 100;

    // Base address of the search page; the query string is appended per search.
    private readonly string url = @"http://www.liepin.com/zhaopin/?";

    // Matches each titled anchor on a result page (one per listed posting).
    private readonly string basicInfoRegexStr = "<a title=[\\s\\S]+?</a>";

    /// <summary>
    /// Work location (display name).
    /// </summary>
    private readonly string workAddress;
    /// <summary>
    /// Work location ID, sent as the <c>dqs</c> parameter.
    /// </summary>
    private readonly string workAddressId;
    /// <summary>
    /// Search keyword.
    /// </summary>
    private readonly string keyWord;
    /// <summary>
    /// Text a posting must contain to be reported; empty means no filter.
    /// </summary>
    private readonly string mustKey;
    #endregion

    /// <summary>
    /// Creates a scraper for one search.
    /// </summary>
    /// <param name="workAddress">Work location name; anchors not containing it are skipped.</param>
    /// <param name="workAddressId">Location ID sent as <c>dqs</c>.</param>
    /// <param name="keyWord">Search keyword; may be empty.</param>
    /// <param name="mustKey">Text the detail page must contain; may be empty.</param>
    public JobFromLiePin(string workAddress, string workAddressId, string keyWord, string mustKey)
    {
        this.workAddress = workAddress;
        this.workAddressId = workAddressId;
        this.keyWord = keyWord;
        this.mustKey = mustKey;
    }

    /// <summary>
    /// Raised with (page HTML, posting) for every parsed posting, and once with
    /// (null, null) after all pages have been processed.
    /// </summary>
    public event GetJobEndEventHandler GetJobEnd;

    /// <summary>
    /// Runs the search: requests result pages until a download comes back empty
    /// or <c>MaxPages</c> is reached, then raises the completion signal once.
    /// Exceptions propagate unchanged, keeping their original type and stack trace.
    /// </summary>
    public void GetJobInfoList()
    {
        StringBuilder condition = new StringBuilder();
        condition.AppendFormat("dqs={0}", workAddressId);
        condition.Append("&searchField=3");
        if (!string.IsNullOrEmpty(keyWord))
        {
            // Encode into the query only, so a second call does not double-encode the field.
            condition.Append("&key=" + HttpUtility.UrlEncode(keyWord, Encoding.UTF8));
        }
        condition.Append("&pubTime=30");

        string searchUrl = url + condition.ToString();
        for (int i = 0; i < MaxPages; i++)
        {
            // The first request carries no page parameter.
            string pageUrl = i > 0 ? searchUrl + "&curPage=" + i : searchUrl;
            string html = GetHtmlCode.GetByget(pageUrl, PageEncoding);
            if (string.IsNullOrEmpty(html))
            {
                break;
            }
            GetJobInfoFromPage(html);
        }

        // Completion is signalled exactly once, after the last page and also when
        // the very first download fails, matching the other scrapers.
        RaiseGetJobEnd(null, null);
    }

    /// <summary>
    /// Processes one search result page: for every titled anchor that mentions
    /// the requested location, follows its link and parses the detail page.
    /// </summary>
    private void GetJobInfoFromPage(string pageStr)
    {
        MatchCollection anchors = Regex.Matches(pageStr, basicInfoRegexStr);
        foreach (Match anchor in anchors)
        {
            if (!anchor.Value.Contains(workAddress))
            {
                continue;
            }

            string jobUrl = Regex.Match(anchor.Value, "(?<=href=\")([\\w.:+?()/%=#&]+)").Value;
            if (!string.IsNullOrEmpty(jobUrl))
            {
                GetJobInfoFromUrl(jobUrl);
            }
        }
    }

    // Regex filters applied to the page body: { pattern, replacement }.
    private static readonly string[][] Filters =
    {
        new[] { @"(?is)<script.*?>.*?</script>", "" },
        new[] { @"(?is)<style.*?>.*?</style>", "" },
        new[] { @"(?is)<!--.*?-->", "" }, // strip HTML comments
        new[] { @"(?is)<footer.*?>.*?</footer>",""},
        new[] { @"(?is)<h3>常用链接:.*?</ul>",""}
    };

    /// <summary>
    /// Downloads one posting, filters it by <c>mustKey</c>, extracts its fields
    /// and raises <see cref="GetJobEnd"/> with the result.
    /// </summary>
    /// <param name="url">Address of the posting's detail page.</param>
    private void GetJobInfoFromUrl(string url)
    {
        string pageStr = GetHtmlCode.GetByget(url, PageEncoding);
        if (string.IsNullOrEmpty(pageStr))
        {
            return;
        }

        pageStr = pageStr.Replace("\r\n", ""); // drop line breaks

        // Keep only the <body> element.
        string body = string.Empty;
        Match bodyMatch = Regex.Match(pageStr, @"(?is)<body.*?</body>");
        if (bodyMatch.Success)
        {
            body = bodyMatch.Value.Replace("<tr >", "<tr>");
        }

        // Remove scripts, styles and other unrelated markup.
        foreach (var filter in Filters)
        {
            body = Regex.Replace(body, filter[0], filter[1]);
        }

        if (!string.IsNullOrEmpty(mustKey) && !body.Contains(mustKey))
        {
            return; // posting does not contain the required text
        }

        // From here on the body has no whitespace; patterns are written accordingly.
        body = Regex.Replace(body, "\\s", "");

        JobInfo info = new JobInfo();
        info.Url = url;

        // Job title
        string position = Regex.Match(body, "<h1title=([\\s\\S]+?)>(.*?)</h1>").Value;
        info.Position = Between(position, ">", "</", false);

        // Company name
        string company = Regex.Match(body, "</h1><h3>(.*?)</h3>").Value;
        info.Company = Between(company, "<h3>", "</h3>", false);

        // Work location
        string address = Regex.Match(body, "<divclass=\"resumeclearfix\"><span>(.*?)</span>").Value;
        info.Address = Between(address, "<span>", "</", false);

        // Company type: list-item layout first, then the "<br>" layout.
        string nature = Regex.Match(body, "<li><span>企业性质:</span>(.*?)</li>").Value;
        info.Nature = Between(nature, "</span>", "</li>", false);
        if (string.IsNullOrEmpty(info.Nature))
        {
            string natureAlt = Regex.Match(body, "<br><span>性质:</span>(.*?)<br>").Value;
            info.Nature = Between(natureAlt, "</span>", "<br>", true);
        }

        // Company size: list-item layout first, then the "<br>" layout.
        string scale = Regex.Match(body, "<li><span>企业规模:</span>(.*?)</li>").Value;
        info.Scale = Between(scale, "</span>", "</li>", false);
        if (string.IsNullOrEmpty(info.Scale))
        {
            string scaleAlt = Regex.Match(body, "<br><span>规模:</span>(.*?)<br>").Value;
            // Only Scale is assigned; the original chained assignment also
            // overwrote Nature with the size.
            info.Scale = Between(scaleAlt, "</span>", "<br>", true);
        }

        // Required experience
        string experience = Regex.Match(body, "<spanclass=\"noborder\">(.*?)</span>").Value;
        info.Experience = Between(experience, ">", "</", false);

        // Minimum education: the span directly before the "noborder" span.
        string education = Regex.Match(body, "</span><span>(.*?)</span><spanclass=\"noborder\">").Value;
        info.Education = Between(education, "<span>", "</span><spanclass=", false);

        // Salary
        string salary = Regex.Match(body, "<pclass=\"job-main-title\">(.*?)<").Value;
        info.Salary = Between(salary, ">", "<", true);

        // Publication date
        string time = Regex.Match(body, "<pclass=\"release-time\">发布时间:<em>(.*?)</em></p>").Value;
        info.Time = Between(time, "<em>", "</em>", false);

        RaiseGetJobEnd(pageStr, info);
    }

    /// <summary>
    /// Returns the part of <paramref name="value"/> between the first
    /// <paramref name="startToken"/> and the first (or last, when
    /// <paramref name="useLastEnd"/> is true) <paramref name="endToken"/>.
    /// Returns an empty string when the value is empty or the markers are
    /// missing or out of order, instead of throwing.
    /// </summary>
    private static string Between(string value, string startToken, string endToken, bool useLastEnd)
    {
        if (string.IsNullOrEmpty(value))
        {
            return string.Empty;
        }

        int start = value.IndexOf(startToken, StringComparison.Ordinal);
        if (start < 0)
        {
            return string.Empty;
        }
        start += startToken.Length;

        int end = useLastEnd
            ? value.LastIndexOf(endToken, StringComparison.Ordinal)
            : value.IndexOf(endToken, StringComparison.Ordinal);
        return end >= start ? value.Substring(start, end - start) : string.Empty;
    }

    /// <summary>
    /// Raises <see cref="GetJobEnd"/> if anyone is subscribed.
    /// </summary>
    private void RaiseGetJobEnd(object sender, JobInfo info)
    {
        // Copy to a local so a concurrent unsubscribe cannot null the delegate
        // between the check and the call.
        GetJobEndEventHandler handler = GetJobEnd;
        if (handler != null)
        {
            handler(sender, info);
        }
    }
}
以上为解析猎聘网招聘信息的类。以下为猎聘网解析出的数据: