[Crawler] Scraping a Million Zhihu Users' Information: The Crawler Module

Project GitHub repository: https://github.com/wangqifan/ZhiHu

UserManage is the crawler module that fetches a user's information.

public class UserManage
{
    private string html;
    private string url_token;
}

Constructor

A user's profile page has the URL format "https://www.zhihu.com/people/" + url_token + "/following".

public UserManage(string urltoken)
{
    url_token = urltoken;
}

First, wrap a method that fetches the HTML page:

private bool GetHtml()
{
    string url = "https://www.zhihu.com/people/" + url_token + "/following";
    html = HttpHelp.DownLoadString(url);
    return !string.IsNullOrEmpty(html);
}
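
HttpHelp itself is not shown in the article. As a rough idea of what DownLoadString might do, here is a minimal sketch; the WebClient approach, the User-Agent value, and the swallow-and-return-empty error handling are all assumptions, not the project's actual code:

```csharp
using System.Net;

// Hypothetical stand-in for the project's HttpHelp class.
public static class HttpHelp
{
    // Download a page as a string; returns "" on failure so callers
    // can test with string.IsNullOrEmpty, as GetHtml() does above.
    public static string DownLoadString(string url)
    {
        try
        {
            using (var client = new WebClient())
            {
                client.Encoding = System.Text.Encoding.UTF8;
                // A browser-like User-Agent helps avoid being rejected
                // outright (assumption; the real project may do more).
                client.Headers[HttpRequestHeader.UserAgent] =
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)";
                return client.DownloadString(url);
            }
        }
        catch (WebException)
        {
            return "";
        }
    }
}
```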

With the HTML page in hand, the next step is to peel the JSON out of the page, with the help of HtmlAgilityPack.

public void analyse()
{
    if (GetHtml())
    {
        try
        {
            Stopwatch watch = new Stopwatch();
            watch.Start();
            HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);
            // The profile data sits in the "data" element's data-state
            // attribute as HTML-encoded JSON.
            HtmlNode node = doc.GetElementbyId("data");
            StringBuilder stringbuilder = new StringBuilder(node.GetAttributeValue("data-state", ""));
            stringbuilder.Replace("&quot;", "'");
            stringbuilder.Replace("&lt;", "<");
            stringbuilder.Replace("&gt;", ">");
            watch.Stop();
            Console.WriteLine("Parsing the HTML took {0} ms", watch.ElapsedMilliseconds.ToString());
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.ToString());
        }
    }
}

Enqueue the links to the user's follower and followee lists:

private void GetUserFlowerandNext(string json)
{
    string foollowed = "https://www.zhihu.com/api/v4/members/" + url_token + "/followers?include=data%5B*%5D.answer_count%2Carticles_count%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20";
    string following = "https://www.zhihu.com/api/v4/members/" + url_token + "/followees?include=data%5B%2A%5D.answer_count%2Carticles_count%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=20&offset=0";

    RedisCore.PushIntoList(1, "nexturl", following);
    RedisCore.PushIntoList(1, "nexturl", foollowed);
}
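
RedisCore is likewise not shown in the article. Judging from the calls it receives here and below (a Redis database index first, then a key and values), it presumably wraps a Redis client along these lines; a minimal sketch using StackExchange.Redis, not the project's actual code:

```csharp
using StackExchange.Redis;

// Hypothetical stand-in for the project's RedisCore class.
public static class RedisCore
{
    private static readonly ConnectionMultiplexer conn =
        ConnectionMultiplexer.Connect("localhost");

    // Push a value onto the right of the list at key, in database db.
    public static void PushIntoList(int db, string key, string value)
    {
        conn.GetDatabase(db).ListRightPush(key, value);
    }

    // Pop from the left so the list behaves as a FIFO queue;
    // returns null when the list is empty.
    public static string PopFromList(int db, string key)
    {
        return conn.GetDatabase(db).ListLeftPop(key);
    }

    // Set a hash field only if it does not exist yet; true means it was
    // new. The article uses this as a "seen before?" deduplication test.
    public static bool InsetIntoHash(int db, string key, string field, string value)
    {
        return conn.GetDatabase(db).HashSet(key, field, value, When.NotExists);
    }
}
```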

Strip the JSON down further, keeping only the user's information, with the help of the JSON parser Newtonsoft.Json:

private void GetUserInformation(string json)
{
    JObject obj = JObject.Parse(json);
    string xpath = "['" + url_token + "']";
    JToken tocken = obj.SelectToken("['entities']").SelectToken("['users']").SelectToken(xpath);
    RedisCore.PushIntoList(2, "User", tocken.ToString());
}

Now the analyse function can be completed:

public void analyse()
{
    if (GetHtml())
    {
        try
        {
            Stopwatch watch = new Stopwatch();
            watch.Start();
            HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);
            HtmlNode node = doc.GetElementbyId("data");
            StringBuilder stringbuilder = new StringBuilder(node.GetAttributeValue("data-state", ""));
            stringbuilder.Replace("&quot;", "'");
            stringbuilder.Replace("&lt;", "<");
            stringbuilder.Replace("&gt;", ">");
            GetUserInformation(stringbuilder.ToString());
            GetUserFlowerandNext(stringbuilder.ToString());
            watch.Stop();
            Console.WriteLine("Parsing the HTML took {0} ms", watch.ElapsedMilliseconds.ToString());
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.ToString());
        }
    }
}
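
A side note on the three Replace calls: they only cover &quot;, &lt; and &gt;. WebUtility.HtmlDecode from System.Net would decode every HTML entity at once, and Newtonsoft.Json parses the resulting double-quoted JSON just as happily as the single quotes the manual substitution produces. A sketch of that variant (ExtractDataState is a hypothetical helper name):

```csharp
using System.Net;
using HtmlAgilityPack;

// Variant of the decoding step: instead of replacing three entities by
// hand, let the framework decode everything in data-state.
static string ExtractDataState(HtmlDocument doc)
{
    HtmlNode node = doc.GetElementbyId("data");
    string raw = node.GetAttributeValue("data-state", "");
    return WebUtility.HtmlDecode(raw); // handles &quot;, &lt;, &gt;, &amp;, ...
}
```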

UrlTask pops a follow-list URL from the nexturl queue and fetches that list; the server returns JSON data.

Wrap a helper class for object serialization and deserialization:

public class SerializeHelper
{
    /// <summary>
    /// Serialize an object to a JSON string
    /// </summary>
    public static string SerializeToString(object value)
    {
        return JsonConvert.SerializeObject(value);
    }

    /// <summary>
    /// Deserialize a JSON string back into an object
    /// </summary>
    public static T DeserializeToObject<T>(string str)
    {
        return JsonConvert.DeserializeObject<T>(str);
    }
}

Wrap the UrlTask class:

public class UrlTask
{
    private string url { get; set; }
    private string JSONstring { get; set; }

    public UrlTask(string _url)
    {
        url = _url;
    }
}

Add a method to fetch the resource:

private bool GetHtml()
{
    JSONstring = HttpHelp.DownLoadString(url);
    Console.WriteLine("JSON downloaded");
    return !string.IsNullOrEmpty(JSONstring);
}

The JSON parsing method:

public void Analyse()
{
    try
    {
        if (GetHtml())
        {
            Stopwatch watch = new Stopwatch();
            watch.Start();
            followerResult result = SerializeHelper.DeserializeToObject<followerResult>(JSONstring);
            if (!result.paging.is_end)
            {
                RedisCore.PushIntoList(1, "nexturl", result.paging.next);
            }
            foreach (var item in result.data)
            {
                // Pick one of Redis databases 3-5 for the dedup hash.
                int type = Math.Abs(item.GetHashCode()) % 3 + 3;
                if (RedisCore.InsetIntoHash(type, "urltokenhash", item.url_token, "exists"))
                {
                    RedisCore.PushIntoList(1, "urltoken", item.url_token);
                }
            }
            watch.Stop();
            Console.WriteLine("Parsing the JSON took {0} ms", watch.ElapsedMilliseconds.ToString());
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.ToString());
    }
}

Explanation: when result.paging.is_end is true, this is the last page of the user's follow list, so there is no next URL to enqueue; otherwise result.paging.next is pushed onto the nexturl queue. The user objects in the data array carry incomplete information, so they are discarded here; each url_token is enough to visit the user's profile page later and fetch the full details.
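
The followerResult type that Analyse deserializes into never appears in the article. Judging from the shape of the Zhihu v4 follow-list JSON (a paging object plus a data array), it presumably looks something like this sketch; the class and property names simply mirror the JSON fields the code touches:

```csharp
using System.Collections.Generic;

// Hypothetical DTOs matching the follow-list response:
// { "paging": { "is_end": false, "next": "..." },
//   "data": [ { "url_token": "..." }, ... ] }
public class followerResult
{
    public Paging paging { get; set; }
    public List<UserItem> data { get; set; }
}

public class Paging
{
    public bool is_end { get; set; }
    public string next { get; set; }
}

public class UserItem
{
    public string url_token { get; set; }
    // The response carries more fields (answer_count, follower_count, ...)
    // but the crawler only needs url_token at this stage.
}
```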

Combining the modules

Wrap a method that pops a nexturl from the queue and visits that follow list to collect more user IDs:

private static void GetNexturl()
{
    string nexturl = RedisCore.PopFromList(1, "nexturl");
    if (!string.IsNullOrEmpty(nexturl))
    {
        UrlTask task = new UrlTask(nexturl);
        task.Analyse();
    }
}

Wrap a method that loops forever, popping a user's urltoken from the queue (and calling GetNexturl when the queue runs empty), then visits the user's profile page to collect the information:

private static void GetUser(object data)
{
    while (true)
    {
        string url_token = RedisCore.PopFromList(1, "urltoken");
        Console.WriteLine(url_token);
        if (!string.IsNullOrEmpty(url_token))
        {
            UserManage manage = new UserManage(url_token);
            manage.analyse();
        }
        else
        {
            GetNexturl();
        }
    }
}

Run these methods from the Main function. Since the workload is large, use multiple threads; choose the thread count to suit your situation.

for (int i = 0; i < 10; i++)
{
    ThreadPool.QueueUserWorkItem(GetUser);
}

Seeding: when the crawler first starts, every queue is empty, so seed data has to be added.

Either add it by hand with commands in redis-cli.exe,

or add this to Main:

UserManage task = new UserManage("a seed user's url_token");
task.analyse();

Run this once and then comment it out, to avoid seeding the same user repeatedly.
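
Putting it all together, Main might look roughly like this; a sketch under the assumptions above (GetUser and GetNexturl as defined earlier, a placeholder seed token, an arbitrary thread count):

```csharp
using System.Threading;

public class Program
{
    // Assumes the GetUser / GetNexturl methods from the previous
    // snippets are defined in this class.
    public static void Main(string[] args)
    {
        // One-off seeding: run once with a real url_token, then comment out.
        // UserManage seed = new UserManage("a-seed-url-token");
        // seed.analyse();

        // Worker threads draining the urltoken queue (and falling back
        // to nexturl when it is empty); tune the count as needed.
        for (int i = 0; i < 10; i++)
        {
            ThreadPool.QueueUserWorkItem(GetUser);
        }

        // Keep the process alive while the pool threads crawl.
        Thread.Sleep(Timeout.Infinite);
    }
}
```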

