本例子只是简单地爬取用户的动态，但会被豆瓣检测为机器人；后续可以接入代理 IP，并模拟浏览器请求。
代码:
import requests
import urllib
import json
import re
import os,sys
def findUserGroup(id):
    """Fetch a Douban user's joined-groups page and process each linked group.

    Builds the ``/group/people/<id>/joins`` URL, fetches it through
    ``requestTo``, collects the ``href`` of every anchor that is immediately
    followed by an ``<img`` tag, and passes each one to ``findGroupDetail``
    together with the user id.  The URL and the raw page text are printed
    as debug output.

    Args:
        id: Douban user id; it is concatenated into the URL, so it must be
            a string.
    """
    url = 'https://www.douban.com/group/people/' + id + '/joins'
    print(url)
    # requestTo is defined elsewhere in this file; only its ``.text``
    # attribute is used here.
    data = requestTo(url)
    # Presumably these hrefs are the group home pages (anchor wrapping the
    # group icon) -- verify against the actual page markup.
    group_urls = re.findall(r'<a href="(.*?)"><img src=', data.text)
    for group_url in group_urls:
        findGroupDetail(group_url, id)
    # NOTE(review): the pasted source lost its indentation; this debug dump
    # is assumed to run once, after the loop -- confirm against the original.
    # Call form of print behaves the same on Python 2 and is valid on Python 3.
    print(data.text)
def findGroupDetail(url, id):
res_tr = '<tr class="">(.*?)</tr>'
title_href = '<a href="(.*?)" title='
pageNum = getPageNum(url)
for index in range(pageNum):
if(index > 2):
break
print index
url = url+'discussion?start='+str(index*25)