# 此篇文章实现的功能:爬取自己关注的人,查看他们各自又关注了哪些人,并把这些人的信息保存下来;利用集合(set)实现了去重版本。
# coding:utf-8
import requests
import json
# MySQL driver
import pymysql
# Connect to the local MySQL server.
# NOTE(review): credentials are hard-coded and the rows go into the system
# database ``mysql`` -- consider a dedicated database and external config.
db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='mysql')  # connect to MySQL
cursor = db.cursor()  # cursor used for all SQL statements below
# MongoDB driver
import pymongo
client = pymongo.MongoClient('mongodb://localhost:27017/')  # connect to MongoDB
zhihu = client.zhihu  # database
collection = zhihu['zhihu']  # collection receiving one document per crawled user
user = {}  # NOTE(review): appears unused anywhere in this file
urls = set()  # profile urls seen so far, shared de-duplication set
import redis
r = redis.Redis(host='127.0.0.1', port=6379, db=0)  # connect to Redis, logical db 0
# Request headers for the Zhihu API.
# NOTE(review): the cookie carries a personal login session -- it will expire
# and should never be committed to version control.
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'accept-language': 'zh-CN,zh;q=0.9',
'cache-control': 'max-age=0',
'cookie': '_zap=5fe3f81a-ee3e-40de-8989-e3ecc5c0c910; _xsrf=G6GqOLiyQBeSqjmTgu9tbJvIKcYF6m2j; d_c0="ABAvO72hsA-PTpHEn1TQ0Zgu3uz2O7VSZCI=|1562331596"; tst=r; q_c1=71631202849145ddbc528c6aa1433f8b|1565178385000|1562331750000; tgw_l7_route=7f546500f1123d2f6701ac7f30637fd6; capsion_ticket="2|1:0|10:1567232500|14:capsion_ticket|44:OTE4NzQzYjQxZjU4NDZiOThjYjJlMjdmNTk3ZmEzYzU=|122aa4288c2ff6a108044dd74b2a0879b84f89d7f53fa0f03c335caf8b68e940"; z_c0="2|1:0|10:1567232513|4:z_c0|92:Mi4xbTMzVEJBQUFBQUFBRUM4N3ZhR3dEeVlBQUFCZ0FsVk5BV0JYWGdCUkpUdHNkOEYwWC1veEdYUm1xT0lfTnB2OUR3|77bca6d79e85ea87426d1b3d5a722624b23cde1a6fa69f76ce05ee248998cc31"',
'referer': 'https://www.zhihu.com/',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
}
def get_json(i, user='li-zhu-80-77-62'):
    """Fetch one page (20 entries) of *user*'s followees from the Zhihu API.

    Args:
        i: zero-based page index; translated to ``offset = i * 20``.
        user: Zhihu url_token of the account whose followees are fetched.
            Defaults to the account that was previously hard-coded, so all
            existing callers behave exactly as before.

    Returns:
        The raw JSON response body as a string.
    """
    url = ('https://www.zhihu.com/api/v4/members/' + user +
           '/followees?include=data%5B*%5D.answer_count%2Carticles_count'
           '%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following'
           '%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics'
           '&offset=' + str(i * 20) + '&limit=20')
    response = requests.get(url, headers=headers).text
    return response
def parse(json_str):
    """Parse one followees page and persist every new user to all three stores.

    Side effects: for each user whose profile url has not been seen before,
    inserts one row into the MySQL table ``zhihu_copy2``, one document into
    the MongoDB ``zhihu`` collection, and one JSON string into Redis keyed by
    the user's profile url. New urls are added to the module-level ``urls`` set.

    Args:
        json_str: raw JSON text of one API page; expected to contain a
            ``data`` list of user objects (raises KeyError otherwise).

    Returns:
        The module-level set of profile urls seen so far.
    """
    page = json.loads(json_str)
    for item in page['data']:
        name = item['name']
        headline = item['headline']
        url = item['url']
        gender = item['gender']
        img_url = item['avatar_url']
        # Skip users already stored -- keeps the "no duplicates" promise that
        # get_url() enforces, here as well (the original inserted blindly).
        if url in urls:
            continue
        urls.add(url)
        sql = 'INSERT INTO zhihu_copy2(name, headline, url, gender, img_url) VALUES (%s, %s, %s, %s, %s)'
        cursor.execute(sql, (name, headline, url, gender, img_url))
        db.commit()  # commit the freshly inserted row
        print('插入本条数据成功')
        record = {'name': name, 'headline': headline, 'url': url, 'gender': gender, 'img_url': img_url}
        # Insert a copy into MongoDB: insert_one mutates its argument by
        # adding ``_id`` (an ObjectId), which json.dumps below cannot encode.
        collection.insert_one(dict(record))
        # BUG FIX: the original wrote every record to the single Redis key
        # 'data', so each user overwrote the previous one and only the last
        # record survived; key by the unique profile url instead.
        r.set(url, json.dumps(record))
        print(name, '\t', headline, '\t', url, '\t', gender, '\t', img_url)
    return urls
def get_url(urls):
    """Crawl the followees of every user in *urls* (up to 10 pages each) and persist them.

    For each profile url, up to 200 followees are fetched page by page; users
    whose url has not been seen yet are inserted into MySQL (``zhihu_copy2``),
    MongoDB, and Redis (keyed by profile url). The input set is copied first,
    so the membership check also covers users discovered during this run.

    Args:
        urls: set of Zhihu profile urls of the users to expand.
    """
    user_urls = urls.copy()
    for user_url in urls:
        # The url_token is the 5th path segment of the profile url
        # ('https:', '', host, 'people', token).
        user_name = user_url.split('/')[4]
        for page in range(10):
            json_url = ('https://www.zhihu.com/api/v4/members/' + user_name +
                        '/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics'
                        '&offset=' + str(page * 20) + '&limit=20')
            response = requests.get(json_url, headers=headers).text
            text = json.loads(response)
            try:
                for item in text['data']:
                    name = item['name']
                    headline = item['headline']
                    url = item['url']
                    gender = item['gender']
                    img_url = item['avatar_url']
                    if url in user_urls:
                        continue  # already stored in a previous iteration
                    user_urls.add(url)
                    sql = 'INSERT INTO zhihu_copy2(name, headline, url, gender, img_url) VALUES (%s, %s, %s, %s, %s)'
                    cursor.execute(sql, (name, headline, url, gender, img_url))
                    db.commit()  # commit the freshly inserted row
                    record = {'name': name, 'headline': headline, 'url': url,
                              'gender': gender, 'img_url': img_url}
                    # Insert a copy: insert_one adds ``_id`` to its argument,
                    # which json.dumps below cannot encode.
                    collection.insert_one(dict(record))
                    # BUG FIX: key by profile url so records are not all
                    # written to (and overwritten at) the single key 'data'.
                    r.set(url, json.dumps(record))
                    print('插入本条数据成功')
            except Exception as e:
                # BUG FIX: the original bare ``except:`` silently swallowed
                # every error, including KeyboardInterrupt/SystemExit. Keep
                # the best-effort "skip this page" behavior, but log it.
                print('本页抓取失败:', e)
                continue
# Driver: crawl 10 pages (up to 200 users) of the seed account's followees,
# then expand each discovered user's followees one level deeper.
try:
    for page_index in range(10):
        json_str = get_json(page_index)
        urls = parse(json_str)
    get_url(urls)
finally:
    # BUG FIX: the original closed only ``db`` and skipped it entirely if the
    # crawl raised; release the cursor and the MongoDB client too, and do it
    # even on failure.
    cursor.close()
    db.close()
    client.close()
# TODO: 正在想办法进一步优化(例如并发抓取、批量提交数据库)。