# -*- coding=utf-8 -*-
import datetime
import bs4,time,requests,json
import pymysql
def ins_info():
    """Download every post page whose code has not been processed yet and
    persist its media links, comments and caption.

    Workflow per unprocessed code (ins_mes rows with no all_pic_link row):
      1. GET https://www.veryins.com/p/<code>, retrying on network errors.
      2. Single-media post: store the <img> src (or the <source> src for a
         video) into all_pic_link.
      3. Carousel ("swiper") post: store every slide's image/video src.
      4. Store each comment (author + text) into all_comments and the post
         caption into all_articles.

    Relies on module-level globals `cur` (DB cursor) and `db` (connection).
    Gives up on a post after 5 failed parse attempts.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36 Edg/80.0.361.109'
    }
    # Codes present in ins_mes but absent from all_pic_link.
    cur.execute('select ins_mes.ins_code from ins_mes left join all_pic_link on ins_mes.ins_code=all_pic_link.ins_code where all_pic_link.ins_code is null')
    results_code = cur.fetchall()
    print('获得所有未处理code')
    for result_code in results_code:
        err = 1  # parse-retry counter for this post
        print('开始读取数据组建链接')
        url_2 = 'https://www.veryins.com/p/' + result_code[0]
        print(url_2)
        while True:  # retry the download until it succeeds
            try:
                res = requests.get(url_2, headers=headers, timeout=10)
                print('连接成功')
                break
            except requests.exceptions.RequestException:
                print('获取网页失败,正在重试!')
                time.sleep(2)
        while True:  # parse/store loop; re-downloads on failure, max 5 tries
            try:
                soup = bs4.BeautifulSoup(res.content, 'lxml')  # parse page source
                num = 1  # running count of stored media, for log messages
                print('获取网页成功,正在分析图片地址')
                swiper_slide = soup.findAll(class_='swiper-slide')
                if len(swiper_slide) == 0:
                    # Single-media post: no carousel on the page.
                    while True:
                        try:
                            # Bug fix: was .replace('amp',''), which removed the
                            # literal substring 'amp' anywhere in the URL;
                            # un-escaping '&amp;' is what was intended.
                            img_wrapper = soup.find(class_='imgwrapper').find('img').attrs['src'].replace('&amp;', '&')
                            cur.execute(
                                'insert into all_pic_link (ins_code,ins_pic_link) values(%s,%s)',
                                (result_code[0], img_wrapper))
                            db.commit()
                            print('已写入数据库第' + str(num) + '张')
                            break
                        except (AttributeError, KeyError):
                            # No usable <img>: the post is a video, take <source>.
                            video_wrapper = soup.find(class_='imgwrapper').find('source').attrs['src'].replace('&amp;', '&')
                            cur.execute(
                                'insert into all_pic_link (ins_code,ins_pic_link) values(%s,%s)',
                                (result_code[0], video_wrapper))
                            db.commit()
                            print('已写入数据库第' + str(num) + '部')
                            break
                else:
                    # Carousel post: one slide per picture/video.
                    for i in swiper_slide:
                        try:
                            img_link = i.find('img').attrs['src'].replace('&amp;', '&')
                            while True:
                                try:
                                    cur.execute(
                                        'insert into all_pic_link (ins_code,ins_pic_link) values(%s,%s)',
                                        (result_code[0], img_link))
                                    db.commit()
                                    print('已写入数据库第' + str(num) + '张')
                                    num += 1
                                    break
                                except pymysql.Error:
                                    print('出错,回滚1')
                                    db.rollback()
                                    time.sleep(2)
                        except (AttributeError, KeyError):
                            # Slide holds a video rather than an image.
                            video_wrapper = i.find('source').attrs['src'].replace('&amp;', '&')
                            while True:
                                try:
                                    cur.execute(
                                        'insert into all_pic_link (ins_code,ins_pic_link) values(%s,%s)',
                                        (result_code[0], video_wrapper))
                                    db.commit()
                                    print('已写入数据库第' + str(num) + '部')
                                    num += 1  # bug fix: counter never advanced for videos
                                    break
                                except pymysql.Error:
                                    print('出错,回滚2')
                                    db.rollback()
                                    time.sleep(2)
                # Store every comment (author handle + text).
                comments_link = soup.findAll(class_='comment-txt')
                for i in comments_link:
                    herf_txt = i.find('a').get_text()
                    comments_txt = i.find('p').get_text()
                    while True:
                        try:
                            cur.execute(
                                'insert into all_comments (ins_code,ins_commenter,comments) values(%s,%s,%s)',
                                (result_code[0], herf_txt, comments_txt))
                            db.commit()
                            break
                        except pymysql.Error:
                            print('出错,回滚3')
                            db.rollback()
                            time.sleep(2)
                # Store the post caption.
                article = soup.find(class_='caption').get_text()
                while True:
                    try:
                        cur.execute(
                            'insert into all_articles (ins_code,articles) values(%s,%s)',
                            (result_code[0], article))
                        db.commit()
                        print('将文章写入数据库')
                        break
                    except pymysql.Error:
                        print('出错,回滚4')
                        db.rollback()
                        time.sleep(2)
                break  # post fully processed
            except Exception:
                # Parsing failed (layout change, truncated download, ...);
                # re-download the page and try again, at most 5 times.
                print('出错重试!')
                time.sleep(5)
                print('获取网页失败,正在重试第:' + str(err) + '次')
                err += 1
                while True:
                    try:
                        res = requests.get(url_2, headers=headers, timeout=10)
                        break
                    except requests.exceptions.RequestException:
                        time.sleep(2)
                if err > 5:
                    break
def _store_post(ins_num, data_code, num):
    """Insert post `data_code` for blogger `ins_num` into ins_mes unless it
    already exists; returns the (possibly advanced) running counter `num`.

    Uses a parameterized query — the original interpolated scraped data
    straight into the SQL string, which is injection-prone — and passes
    the scalar ins_num (the original inserted the whole row tuple).
    """
    if cur.execute('select * from ins_mes where ins_code = %s', (data_code,)) == 0:
        now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        img_p_link = 'https://www.veryins.com/p/' + data_code
        cur.execute(
            'insert into ins_mes (ins_num,ins_code,ins_p_link,time_add,time_update) values(%s,%s,%s,%s,%s)',
            (ins_num, data_code, img_p_link, now_time, now_time))
        db.commit()
        print('已写入数据库' + str(num) + '条')
        num += 1
    return num


def _load_more(headers, next_cursor, uid_num):
    """POST the 'load more' ajax endpoint and return the decoded JSON,
    retrying (with a delay — the original busy-looped) until it succeeds."""
    post_mes = 'https://www.veryins.com/user/post?next=' + next_cursor + '&uid=' + uid_num
    print(post_mes)
    while True:
        try:
            return json.loads(requests.post(url=post_mes, headers=headers, timeout=10).text)
        except Exception:
            print('post失败,正在重试!')
            time.sleep(3)


def get_ins():
    """Crawl every blogger listed in ins_index and record each of their
    post codes/links in ins_mes.

    Bloggers on the hard-coded list below get an incremental '更新' pass
    (stop once the DB row count reaches the post count shown on the page);
    everyone else gets a full '录入' crawl. Pagination works by reading the
    page's uid / next-cursor attributes and POSTing the ajax endpoint.

    Relies on module-level globals `cur`, `db` and `veryins_url`.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }
    cur.execute('select ins_number from ins_index')
    results = cur.fetchall()
    for ins_number in results:
        # These accounts are already fully recorded; only fetch what's new.
        if ins_number[0] in ('gurl_anna', 's647746', 'tinbaby_123',
                             'moonwangxiaoai', '33333heart', 'luohluo2019'):
            judge = '更新'
        else:
            judge = '录入'
        url_1 = veryins_url + '/' + ins_number[0]
        print(url_1)
        if judge == '录入':
            num = 1
            num_item = 2  # placeholder until the real post count is parsed
            while num <= int(num_item):
                try:
                    res = requests.get(url_1, headers=headers, timeout=10)
                    load_mes = bs4.BeautifulSoup(res.content, 'lxml')  # parse page source
                    all_item = load_mes.findAll(attrs={'class': "item"})
                    num_item = load_mes.findAll(attrs={'class': "count"})[0].get_text().split('帖子')[0]
                    print(num_item)
                    while num <= int(num_item):
                        for all_item_1 in all_item:
                            data_code = all_item_1.find(class_="img-wrap").get('data-code')
                            num = _store_post(ins_number[0], data_code, num)
                        # Bug fix: the original called .findAll()/.get() on the
                        # list returned by Tag.get('class'), which always raised;
                        # use the working pattern from the update branch.
                        uid_class = load_mes.findAll('div')[5].get('class')[0].lower()
                        uid_num = load_mes.findAll('div')[5].get(uid_class)
                        next_cursor = load_mes.find(class_='list').get('next-cursor')
                        while num <= int(num_item):
                            try:
                                res1 = _load_more(headers, next_cursor, uid_num)
                                print(res1)
                                for k in res1['nodes']:
                                    print(k['code'])
                                    num = _store_post(ins_number[0], k['code'], num)
                                if str(res1['page_info']['has_next_page']) == 'True':
                                    next_cursor = res1['page_info']['end_cursor']
                                    time.sleep(3)
                                else:
                                    break
                            except Exception:
                                # Re-read the paging attributes and retry.
                                print('加载更多失败,正在重试!')
                                uid_class = load_mes.findAll('div')[6].get('class')[0].lower()
                                uid_num = load_mes.findAll('div')[6].get(uid_class)
                                next_cursor = load_mes.find(class_='list').get('next-cursor')
                                time.sleep(2)
                except requests.exceptions.ConnectionError:
                    print('ConnectionError -- please wait 3 seconds')
                    time.sleep(3)
                except requests.exceptions.ChunkedEncodingError:
                    print('ChunkedEncodingError -- please wait 3 seconds')
                    time.sleep(3)
                except Exception:
                    print('Unfortunitely -- An Unknow Error Happened, Please wait 3 seconds')
                    time.sleep(3)
        if judge == '更新':
            num = 1
            while True:
                try:
                    res = requests.get(url_1, headers=headers, timeout=10)
                    load_mes = bs4.BeautifulSoup(res.content, 'lxml')  # parse page source
                    all_item = load_mes.findAll(attrs={'class': "item"})
                    num_item = load_mes.findAll(attrs={'class': "count"})[0].get_text().split('帖子')[0]
                    print(ins_number[0] + num_item)
                    # Rows already stored for this blogger (parameterized query).
                    num_db = cur.execute('select * from ins_mes where ins_num = %s', (ins_number[0],))
                    print('验证')
                    if num_db >= int(num_item):
                        print('数据库最新,无需更新')
                        break
                    else:
                        update_num = int(num_item) - num_db
                        while num < update_num:
                            for all_item_1 in all_item:
                                data_code = all_item_1.find(class_="img-wrap").get('data-code')
                                num = _store_post(ins_number[0], data_code, num)
                            uid_class = load_mes.findAll('div')[5].get('class')[0].lower()
                            uid_num = load_mes.findAll('div')[5].get(uid_class)
                            next_cursor = load_mes.find(class_='list').get('next-cursor')
                            while num <= int(num_item):
                                try:
                                    res1 = _load_more(headers, next_cursor, uid_num)
                                    for k in res1['nodes']:
                                        print(k['code'])
                                        num = _store_post(ins_number[0], k['code'], num)
                                    if str(res1['page_info']['has_next_page']) == 'True':
                                        next_cursor = res1['page_info']['end_cursor']
                                        time.sleep(3)
                                    else:
                                        break
                                except Exception:
                                    print('加载更多失败,正在重试!')
                                    time.sleep(2)
                except requests.exceptions.ConnectionError:
                    print('ConnectionError -- please wait 3 seconds')
                    time.sleep(3)
                except requests.exceptions.ChunkedEncodingError:
                    print('ChunkedEncodingError -- please wait 3 seconds')
                    time.sleep(3)
                except Exception:
                    print('Unfortunitely -- An Unknow Error Happened, Please wait 3 seconds')
                    time.sleep(3)
def add_ins():
    """Interactively add new bloggers to ins_index.

    Keeps prompting for veryins usernames while the operator answers '是';
    for each new username it fetches the profile page to resolve the
    display name, then inserts (name, number, timestamps) into ins_index.

    Relies on module-level globals `cur`, `db` and `veryins_url`.
    """
    # Hoisted out of the retry loop — the original rebuilt it every attempt.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }
    judge = input('是否新增ins博主')
    if judge == '是':
        while judge == '是':
            ins_number_1 = input('请输入ins number')
            # Parameterized query — the original interpolated raw user input
            # into the SQL string (injection-prone).
            if cur.execute('select * from ins_index where ins_number = %s', (ins_number_1,)) == 0:
                add_ins_link = veryins_url + '/' + ins_number_1
                while True:  # retry until the profile page is stored
                    try:
                        res = requests.get(add_ins_link, headers=headers, timeout=10)
                        soup = bs4.BeautifulSoup(res.content, 'lxml')  # parse page source
                        ins_name_1 = soup.find(attrs={'id': "username"}).get('data-fullname')
                        now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                        cur.execute('insert into ins_index (ins_name,ins_number,time_add,time_update) values(%s,%s,%s,%s)',
                                    (ins_name_1, ins_number_1, now_time, now_time))
                        db.commit()
                        print('添加成功')
                        break
                    except requests.exceptions.ConnectionError:
                        print('ConnectionError -- please wait 3 seconds')
                        time.sleep(3)
                    except requests.exceptions.ChunkedEncodingError:
                        print('ChunkedEncodingError -- please wait 3 seconds')
                        time.sleep(3)
                    except Exception:
                        print('Unfortunitely -- An Unknow Error Happened, Please wait 3 seconds')
                        time.sleep(3)
            else:
                print('已存在该博主,是否继续输入新的ins博主?')
                judge = input()
        print('添加完成')
if __name__ == "__main__":
    veryins_url = 'https://www.veryins.com'
    # Keyword arguments: positional connect() arguments were removed in
    # PyMySQL 1.0, so the old positional form breaks on current versions.
    db = pymysql.connect(host='localhost', user='root', passwd='toor', db='veryins')
    cur = db.cursor()
    try:
        ins_long = cur.execute('select * from ins_index')  # row count of known bloggers
        # add_ins()
        # get_ins()
        ins_info()
    finally:
        # Close the connection even if the crawl raises.
        db.close()
# TODO: 目前完成到爬取单个用户的所有帖子主页,下一步学习将链接写入数据库,并爬取每个帖子的链接
# NOTE: 初学python,还未完成,继续补充。有大佬可以指正,感激不尽!