csdn不再是之前的很多免费开源的站点了,很多大神装B收费了,这里针对51job前程无忧完整反爬,然后很多代码收费的问题,做了针对性解决,具体方法:C#获取浏览器数据然后保存本地,然后python读取,也可以针对其他类似反爬网站;
说明:python中也有类似模拟浏览器访问的情况,可以研究下!
ps:python所谓的爬虫,其实就是之前的post和get方法,很多网站都屏蔽了,没有想象中的那样高大上,只是python中的开源包比较多罢了,个人见解,不喜勿喷;
以下免费开源代码奉上,供大家学习参考使用;
C#利用webbrowser获取网页代码,并写入text:
/// <summary>
/// Writes <paramref name="massage"/> to <paramref name="filePath"/>,
/// creating the file if needed and overwriting any existing content.
/// Uses the system default (ANSI) encoding so the Python side can read
/// it back as GBK on a Chinese-locale Windows machine.
/// </summary>
/// <param name="filePath">Full path of the text file to write.</param>
/// <param name="massage">Content to write (sic: original parameter name kept for compatibility).</param>
public static void WriteTxt(string filePath, string massage)
{
    // StreamWriter with append=false creates/truncates the file itself,
    // so the original's extra "new FileStream(...); fs.Close();" dance and
    // the File.Exists() check were redundant. The using block guarantees
    // the writer is flushed and disposed even if Write throws.
    using (StreamWriter sw = new StreamWriter(filePath, false, Encoding.Default))
    {
        sw.Write(massage);
    }
}
文本保存为html文件
/// <summary>
/// Converts a GB2312-encoded plain-text file into a UTF-8 HTML file:
/// spaces become <c>&amp;nbsp;</c> and CRLF line breaks become <c>&lt;br&gt;</c>.
/// </summary>
/// <param name="txtFilePath">Path of the source text file (GB2312 encoded).</param>
/// <param name="htmlFilePath">Path of the HTML file to create/overwrite.</param>
public static void TxttoHtml(string txtFilePath, string htmlFilePath)
{
    // Example paths (from the original post):
    //   txt : AppDomain.CurrentDomain.BaseDirectory + "/Files/Test.txt"
    //   html: AppDomain.CurrentDomain.BaseDirectory + "/Files/Test.Html"
    string content;
    // using blocks ensure both the stream and the reader are disposed even
    // on exception — the original leaked them by never calling Close/Dispose.
    using (FileStream stream = File.OpenRead(txtFilePath))
    using (StreamReader reader = new StreamReader(stream, System.Text.Encoding.GetEncoding("GB2312")))
    {
        // Replace spaces with &nbsp; and CRLF with <br> so the layout
        // survives in HTML. (The blog engine had mangled "&nbsp;" into a
        // plain space in the original, making that Replace a no-op.)
        content = reader.ReadToEnd().Replace(" ", "&nbsp;").Replace("\r\n", "<br>");
    }
    // Write the converted markup as UTF-8.
    using (FileStream htmlstream = new FileStream(htmlFilePath, FileMode.Create))
    using (StreamWriter htmlWriter = new StreamWriter(htmlstream, System.Text.Encoding.UTF8))
    {
        htmlWriter.Write(content);
    }
}
webbrowser加载完毕读入网页:
/// <summary>
/// Fires once the WebBrowser control has fully loaded the page (i.e. after
/// any client-side anti-scraping scripts have run), then dumps the rendered
/// DOM body to disk as both .txt and .html for the Python parser to consume.
/// </summary>
private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    // Snapshot of the rendered markup, not the raw HTTP response.
    string renderedHtml = webBrowser1.Document.Body.OuterHtml;
    WriteTxt(@"D:\web.txt", renderedHtml);
    // Same bytes under an .html extension; overwrite if it already exists.
    File.Copy(@"D:\web.txt", @"D:\web.html", true);
}
C#form中加载代码:
/// <summary>
/// Builds the form and immediately points the embedded WebBrowser control
/// at the 51job search-results page to be scraped. The DocumentCompleted
/// handler takes over once the page has rendered.
/// </summary>
public Form1()
{
    InitializeComponent();
    // Double-URL-encoded search for "物流" (logistics), area 080200, page 1.
    string url = "https://search.51job.com/list/080200,000000,0000,00,9,99,%25E7%2589%25A9%25E6%25B5%2581,2,1.html?";
    this.webBrowser1.Url = new Uri(url);
}
python中完整代码:
"""Parse the locally saved 51job results page (web.html, written by the C#
WebBrowser helper) and dump the job listings to web.json."""
import json
import requests  # kept from the original post; not actually used below
from bs4 import BeautifulSoup

# The C# side saved the page with the system ANSI codepage -> read as GBK.
with open('web.html', 'r', encoding='gbk') as f:
    soup = BeautifulSoup(f, 'html.parser')

# Each job card on the results page is a <div class="e">.
cards = soup.find_all('div', attrs={'class': 'e'})

job = []
for item in cards:
    # Look each field up once (the original called find_all(...)[0] twice
    # per field). find() returns None when a card lacks the field.
    name = item.find('span', attrs={'class': 'jname at'})
    income = item.find('span', attrs={'class': 'sal'})
    require = item.find('span', attrs={'class': 'd at'})
    # Skip incomplete cards explicitly instead of the original bare
    # `except: continue`, which silently hid every kind of error.
    if name is None or income is None or require is None:
        continue
    # Debug output, kept from the original script.
    print(name.text)
    print(income.text)
    print(require.text)
    job.append({
        'jobname': name.text,
        'jobincome': income.text,
        'jobrequire': require.text,
    })

with open('web.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps the Chinese text human-readable in the file.
    json.dump(job, f, indent=1, ensure_ascii=False)
json数据如下:
{
"jobname": "医药物流大客户经理",
"jobincome": "1-2万/月",
"jobrequire": "杭州-富阳区 | 3-4年经验 | 大专 | 招3人"
},
{
"jobname": "销售主管(物流)",
"jobincome": "0.8-1万/月",
"jobrequire": "杭州 | 2年经验 | 高中 | 招1人"
},