抓取veryins

 
# -*- coding=utf-8 -*-
import datetime
import bs4,time,requests,json
import pymysql


def ins_info():
    """Crawl every post page whose ins_code has no row in all_pic_link yet,
    and store its picture/video links, comments and caption text.

    Reads post codes from ins_mes, writes to all_pic_link, all_comments and
    all_articles.  Relies on the module-level ``cur`` (cursor) and ``db``
    (connection) globals created in ``__main__``.  Returns None.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36 Edg/80.0.361.109'
    }

    def _fetch(url):
        # GET with unlimited retries; only network errors are retried.
        while True:
            try:
                res = requests.get(url, headers=headers, timeout=10)
                print('连接成功')
                return res
            except requests.RequestException:
                print('获取网页失败,正在重试!')
                time.sleep(2)

    def _insert_retry(sql, params, ok_msg=None):
        # Retry a single INSERT until it succeeds, rolling back on failure.
        # (Replaces four copy-pasted commit/rollback loops in the original.)
        while True:
            try:
                cur.execute(sql, params)
                db.commit()
                if ok_msg:
                    print(ok_msg)
                return
            except pymysql.MySQLError as exc:
                print('数据库写入出错,回滚: %s' % exc)
                db.rollback()
                time.sleep(2)

    def _media_link(node):
        # A slide holds either an <img> (photo) or a <source> (video).
        # Fix: unescape the '&amp;' entity instead of deleting every literal
        # 'amp' substring, which corrupted URLs containing e.g. 'stamp'.
        img = node.find('img')
        if img is not None:
            return img.attrs['src'].replace('&amp;', '&'), '张'
        return node.find('source').attrs['src'].replace('&amp;', '&'), '部'

    # Posts present in ins_mes but not yet processed into all_pic_link.
    cur.execute(
        'select ins_mes.ins_code from ins_mes left join all_pic_link '
        'on ins_mes.ins_code=all_pic_link.ins_code '
        'where all_pic_link.ins_code is null')
    results_code = cur.fetchall()
    print('获得所有未处理code')
    pic_sql = 'insert into all_pic_link (ins_code,ins_pic_link) values(%s,%s)'
    for result_code in results_code:
        code = result_code[0]
        err = 1
        print('开始读取数据组建链接')
        url_2 = 'https://www.veryins.com/p/' + code
        print(url_2)
        res = _fetch(url_2)
        while True:
            try:
                soup = bs4.BeautifulSoup(res.content, 'lxml')  # parse page source
                num = 1
                print('获取网页成功,正在分析图片地址')
                swiper_slide = soup.findAll(class_='swiper-slide')
                if not swiper_slide:
                    # Single-media post: the media sits in .imgwrapper.
                    link, unit = _media_link(soup.find(class_='imgwrapper'))
                    _insert_retry(pic_sql, (code, link),
                                  '已写入数据库第' + str(num) + unit)
                else:
                    # Carousel post: one slide per photo/video.
                    for slide in swiper_slide:
                        link, unit = _media_link(slide)
                        _insert_retry(pic_sql, (code, link),
                                      '已写入数据库第' + str(num) + unit)
                        if unit == '张':
                            num += 1  # original only counted photos
                for comment in soup.findAll(class_='comment-txt'):
                    herf_txt = comment.find('a').get_text()
                    comments_txt = comment.find('p').get_text()
                    _insert_retry(
                        'insert into all_comments (ins_code,ins_commenter,comments) values(%s,%s,%s)',
                        (code, herf_txt, comments_txt))
                article = soup.find(class_='caption').get_text()
                _insert_retry(
                    'insert into all_articles (ins_code,articles) values(%s,%s)',
                    (code, article), '将文章写入数据库')
                break
            except Exception:
                # Page layout differed from expectations — re-fetch and retry
                # up to 5 times before giving up on this post.
                print('出错重试!')
                time.sleep(5)
                print('获取网页失败,正在重试第:' + str(err) + '次')
                err += 1
                res = _fetch(url_2)
                if err > 5:
                    break



def get_ins():
    """For every account in ins_index, crawl its veryins profile and record
    every post code/link in ins_mes.

    A hard-coded allow-list of accounts gets incremental "更新" treatment
    (stop once the DB row count reaches the site's post count); all other
    accounts get a full "录入" import.  Uses the module-level ``cur``,
    ``db`` and ``veryins_url`` globals.  Returns None.

    Fixes vs. the original:
    - parameterized SQL (was %-interpolated → injection / breaks on quotes);
    - inserts ``account`` (the string) as ins_num, not the whole row tuple;
    - the 更新 branch's pagination body was mis-indented inside its
      ``except:`` clause and never ran — both modes now share one code path;
    - uid extraction used ``.findAll`` on the list returned by
      ``.get('class')`` (guaranteed AttributeError) — now uses the working
      pattern the original's own retry handler used;
    - the POST retry loop now sleeps between attempts.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }
    # Accounts already imported once; only fetch what the DB is missing.
    update_accounts = {'gurl_anna', 's647746', 'tinbaby_123',
                       'moonwangxiaoai', '33333heart', 'luohluo2019'}

    def _save_post(account, data_code):
        # Insert one post row unless this code already exists.
        img_p_link = 'https://www.veryins.com/p/' + data_code
        now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        if cur.execute('select * from ins_mes where ins_code = %s',
                       (data_code,)) == 0:
            cur.execute(
                'insert into ins_mes (ins_num,ins_code,ins_p_link,time_add,time_update) values(%s,%s,%s,%s,%s)',
                (account, data_code, img_p_link, now_time, now_time))
            db.commit()

    def _fetch_more(next_cursor, uid_num):
        # POST the "load more" endpoint until it answers with valid JSON.
        post_mes = 'https://www.veryins.com/user/post?next=' + next_cursor + '&uid=' + uid_num
        print(post_mes)
        while True:
            try:
                return json.loads(
                    requests.post(url=post_mes, headers=headers, timeout=10).text)
            except (requests.RequestException, ValueError):
                print('post失败,正在重试!')
                time.sleep(3)

    def _crawl(account, url_1, update_mode):
        # Crawl one account's profile; retries the whole pass on failure.
        num = 1
        while True:
            try:
                res = requests.get(url_1, headers=headers, timeout=10)
                load_mes = bs4.BeautifulSoup(res.content, 'lxml')  # parse page source
                all_item = load_mes.findAll(attrs={'class': 'item'})
                # The "count" element reads like "123帖子..."; take the number.
                total = int(load_mes.findAll(attrs={'class': 'count'})[0]
                            .get_text().split('帖子')[0])
                print(account + str(total))
                if update_mode:
                    num_db = cur.execute(
                        'select * from ins_mes where ins_num = %s', (account,))
                    print('验证')
                    if num_db >= total:
                        print('数据库最新,无需更新')
                        return
                    target = total - num_db
                else:
                    target = total
                # First page of posts is embedded in the profile HTML.
                for item in all_item:
                    data_code = item.find(class_='img-wrap').get('data-code')
                    _save_post(account, data_code)
                    print('已写入数据库' + str(num) + '条')
                    num += 1
                # Remaining pages come from the JSON "load more" endpoint;
                # the uid is stored as a data attribute named after the
                # div's own (lowercased) class.
                uid_div = load_mes.findAll('div')[6]
                uid_num = uid_div.get(uid_div.get('class')[0].lower())
                next_cursor = load_mes.find(class_='list').get('next-cursor')
                while num <= target:
                    res1 = _fetch_more(next_cursor, uid_num)
                    for k in res1['nodes']:
                        print(k['code'])
                        _save_post(account, k['code'])
                        print('已写入数据库' + str(num) + '条')
                        num += 1
                    if str(res1['page_info']['has_next_page']) == 'True':
                        next_cursor = res1['page_info']['end_cursor']
                        time.sleep(3)  # be polite between pages
                    else:
                        break
                return
            except requests.exceptions.ConnectionError:
                print('ConnectionError -- please wait 3 seconds')
                time.sleep(3)
            except requests.exceptions.ChunkedEncodingError:
                print('ChunkedEncodingError -- please wait 3 seconds')
                time.sleep(3)
            except Exception:
                print('Unfortunitely -- An Unknow Error Happened, Please wait 3 seconds')
                time.sleep(3)

    cur.execute('select ins_number from ins_index')
    for row in cur.fetchall():
        account = row[0]
        url_1 = veryins_url + '/' + account
        print(url_1)
        _crawl(account, url_1, update_mode=account in update_accounts)



def add_ins():
    """Interactively register new bloggers in ins_index.

    Prompts for account names on stdin; for each unknown account, fetches
    its veryins profile to resolve the display name and inserts a row.
    Uses the module-level ``cur``, ``db`` and ``veryins_url`` globals.
    Returns None.

    Fixes vs. the original: parameterized existence check (was
    %-interpolated SQL), and the loop now asks whether to continue after a
    *successful* add too — previously only the "already exists" branch
    re-asked, so a successful add prompted for the next account forever.
    """
    # Hoisted out of the retry loop — it never changes between attempts.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }
    judge = input('是否新增ins博主')
    while judge == '是':
        ins_number_1 = input('请输入ins number')
        if cur.execute('select * from ins_index where ins_number = %s',
                       (ins_number_1,)) == 0:
            add_ins_link = veryins_url + '/' + ins_number_1
            while True:
                try:
                    res = requests.get(add_ins_link, headers=headers, timeout=10)
                    soup = bs4.BeautifulSoup(res.content, 'lxml')  # parse page source
                    # Display name lives on the #username element.
                    ins_name_1 = soup.find(attrs={'id': 'username'}).get('data-fullname')
                    now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    cur.execute(
                        'insert into ins_index (ins_name,ins_number,time_add,time_update) values(%s,%s,%s,%s)',
                        (ins_name_1, ins_number_1, now_time, now_time))
                    db.commit()
                    print('添加成功')
                    break
                except requests.exceptions.ConnectionError:
                    print('ConnectionError -- please wait 3 seconds')
                    time.sleep(3)
                except requests.exceptions.ChunkedEncodingError:
                    print('ChunkedEncodingError -- please wait 3 seconds')
                    time.sleep(3)
                except Exception:
                    print('Unfortunitely -- An Unknow Error Happened, Please wait 3 seconds')
                    time.sleep(3)
            print('是否继续输入新的ins博主?')
            judge = input()
        else:
            print('已存在该博主,是否继续输入新的ins博主?')
            judge = input()
    print('添加完成')

if __name__ == "__main__":
    veryins_url = 'https://www.veryins.com'
    # Keyword arguments: PyMySQL 1.0 removed positional connect() args
    # ('passwd'/'db' aliases still work, but the modern names are clearer).
    # NOTE(review): credentials are hard-coded — move to config/env.
    db = pymysql.connect(host='localhost', user='root',
                         password='toor', database='veryins')
    cur = db.cursor()
    # Enable the stage you want to run:
    #add_ins()
    #get_ins()
    ins_info()
    db.close()

目前完成到爬取单个用户的所有帖子主页,下一步学习将链接写入数据库,并爬取每个帖子的链接

初学 Python,还未完成,继续补充。有大佬可以指正,感激不尽!

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值