This script crawls users' profile information (mainly city, interests, and joined groups), the details of each joined group, and the past events those groups have published. An API-based version will be released later; thanks for your interest.
```python
import csv
import requests
from bs4 import BeautifulSoup
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}
'''
This loop walks through the users one at a time; the loop range is the span of
user ID numbers. Alternatively, you can issue the requests directly with the
Postman tool, or call the API that Meetup provides for developers (a hedged
sketch of that route follows below).
'''
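# --- Hedged sketch: the developer-API route mentioned above -----------------
# This helper is an illustration only and is not used by the crawler below.
# The endpoint path and JSON response shape are assumptions, not verified
# against Meetup's current documentation; check the official developer docs
# before relying on them.
def fetch_member_via_api(member_id, api_base='https://api.meetup.com'):
    """Hypothetical helper: fetch one member record as JSON instead of HTML."""
    resp = requests.get('{}/members/{}'.format(api_base, member_id),
                        headers=header, timeout=10)
    resp.raise_for_status()  # surface HTTP errors instead of parsing bad HTML
    return resp.json()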
for i in range(40000, 50000):
    print('i=', i)
    member_id = i
    try:
        url = 'https://www.meetup.com/members/' + str(i) + '/'
        r = requests.get(url, headers=header, timeout=10)  # skip this user if the page cannot be fetched
    except requests.RequestException:
        continue
    # Page fetched successfully: start parsing
    soup = BeautifulSoup(r.text, 'html.parser')
    client_data = []
    # username, city, and interests; skip the user if any of them is missing
    try:
        interests_mid = soup.find('li', class_="D_group small last").div.find_all('a')
        interests = []
        for k in range(0, len(interests_mid)):
            interests.append(interests_mid[k].text.strip())
        name = soup.find('h1', class_="flush--bottom text--display3").span.string
        position = soup.find('div', class_="D_memberProfileContentItem").p.a.span.string
    except AttributeError:
        continue
    # Assemble the user record: id, username, city, interests
    client_data.append(member_id)
    client_data.append(name)
    client_data.append(position)
    client_data.append(interests)
    # find_all never raises; it returns an empty list, so no try/except (or
    # length flags) is needed here. Profile pages render the joined-group
    # widgets under three markup variants:
    group1 = soup.find_all('div', class_="D_group clearfix first groupinfo-widget")
    group2 = soup.find_all('div', class_="D_group clearfix groupinfo-widget")
    group3 = soup.find_all('div', class_="D_group clearfix display-none groupinfo-widget")
    group = group1 + group2 + group3  # groups the user has joined
    if len(group) == 0:
        continue  # users who joined no group are not recorded
    group_join = []      # names of the joined groups
    group_join1 = []     # names that survive the per-group scrape below
    group_join_web = []  # URLs of the joined groups
    for s in range(0, len(group)):  # for each group widget, extract the group's name and URL
try:
group_join.append(
group[s].find('div', class_="figureset-description margin-bottom").h4.a.text.strip())
group_join_web.append(group[s].find('div', class_="figureset-description margin-bottom").h4.a['href'])
        except (AttributeError, KeyError):  # name or URL missing in this widget: skip it and keep looking
            continue
    if len(group_join_web) == 0:
        continue  # the user did join groups, but none could be extracted, so skip this user and move to the next id
group_join1 = group_join.copy()
    for k in range(0, len(group_join_web)):
        group_data = []
        url = group_join_web[k]
        try:
            r = requests.get(url, headers=header, timeout=10)
            soup = BeautifulSoup(r.text, 'html.parser')
            # group introduction and group location
            group_intro = (soup.find('span', class_="infoToggle-label").text.strip() + '|'
                           + soup.find('div', class_="group-description--wrapper").text.strip())
            group_posi = soup.find('ul', class_="organizer-city").a.span.text.strip()
            # the group's past events
            r = requests.get(group_join_web[k] + 'events/past/', headers=header, timeout=10)
            soup = BeautifulSoup(r.text, 'html.parser')
            activity = []
            activity1 = []
            activity_web = soup.find_all('li', class_="list-item border--none")
            if len(activity_web) == 0:
                continue  # skip groups with no published events: missing or incomplete event data is useless
            for i1 in range(0, len(activity_web)):
                activity.append(activity_web[i1].find('a', class_="eventCardHead--title").text.strip())
            activity1 = activity.copy()
            group_data.append(group_join[k])      # group name
            group_data.append(group_join_web[k])  # group URL
            group_data.append(group_posi)         # group location
            group_data.append(group_intro)        # group introduction
            for k1 in range(0, len(activity_web)):
                activity_data = []
                try:
                    p_all = activity_web[k1].find_all('p', class_="text--small padding--top margin--halfBottom")
                    activity_detail = p_all[1].text.strip()
                    activity_date = activity_web[k1].find('span', class_="eventTimeDisplay-startDate").text
                except (AttributeError, IndexError):  # description or date missing: drop this event
                    activity1.remove(activity[k1])
                    continue
                # The venue appears under one of three markup variants; try each in turn.
                try:
                    activity_position = activity_web[k1].find(
                        'p', class_="venueDisplay venueDisplay-venue-locationHidden padding--left-half text--secondary text--small").span.text.strip()
                except AttributeError:
                    try:
                        activity_position = activity_web[k1].find(
                            'div', class_="venueDisplay venueDisplay-venue padding--left-half text--secondary text--small").address.p.text.strip()
                    except AttributeError:
                        try:
                            activity_position = activity_web[k1].find(
                                'p', class_="venueDisplay venueDisplay-venue-noVenue padding--left-half text--secondary text--small").span.text.strip()
                        except AttributeError:  # no venue found under any variant: drop this event
                            activity1.remove(activity[k1])
                            continue
                activity_data.append(group_join[k])
                activity_data.append(activity[k1])
                activity_data.append(activity_date)
                activity_data.append(activity_position)
                activity_data.append(activity_detail)
                with open("activity_data.csv", 'a', newline='', encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(activity_data)  # the with block closes the file automatically
            if len(activity1) == 0:  # every event in this group was dropped: skip the group
                group_join1.remove(group_join[k])
                continue
            group_data.append(activity1)  # the group's events
            with open("group_data.csv", 'a', newline='', encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerow(group_data)
        except Exception:  # any failure while scraping this group: drop it and move on
            group_join1.remove(group_join[k])
            continue
    if len(group_join1) == 0:
        continue
    client_data.append(group_join1)
    with open("client_data.csv", 'a', newline='', encoding="utf-8") as f:  # record verified: save the user's data
        writer = csv.writer(f)
        writer.writerow(client_data)
    # At this point the row is written: all of this user's info and the matching group URLs were crawled successfully.
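
# --- Hedged sketch: reading the output back ----------------------------------
# Not part of the original crawl. Each CSV row follows the append order used
# above; a client_data.csv row is [id, username, city, interests, groups],
# where the list-valued fields were serialized by csv.writer as their Python
# repr strings.
import os

if os.path.exists("client_data.csv"):
    with open("client_data.csv", newline='', encoding="utf-8") as f:
        for row in csv.reader(f):
            member_id, name, city, interests_repr, groups_repr = row
            print(member_id, name, city)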