"21-Day Good Habits" Issue 1 - 13: A Web-Scraping Round-Up

After writing scrapers for a while, I put together a combined exercise: scrape the top six Weibo hot-search entries and the first six videos on Bilibili's popular feed and store them in SQLite, then scrape six wallpapers from 3gbizhi.com and save them into the img folder. The whole scraper is meant to hook up with a front-end project, so everything it collects is persisted as data.

Here is the code:

import os
import sqlite3

import requests
from lxml import etree


def init_db(dbpath):
    """Create the table for the given database file if it does not already exist."""
    sql1 = '''
    create table if not exists homepage
    (
        num integer primary key autoincrement,
        info text,
        href text,
        spot text
    )
    ''' 

    sql2 = '''
    create table if not exists bilibili
    (
        num integer primary key autoincrement,
        href text,
        title text,
        UP text
    )    
    '''

    if dbpath == 'homepage.db':
        sql = sql1
    elif dbpath == 'bilibili.db':
        sql = sql2
    else:
        raise ValueError(f'unknown database file: {dbpath}')
    conn = sqlite3.connect(dbpath)
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    conn.close()

def saveDB(rows, dbpath):
    """Write the scraped rows into the matching table, creating it first if needed."""
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    # Pick the insert statement from the database file name.
    if dbpath == 'homepage.db':
        sql = 'replace into homepage (num,info,href,spot) values (?,?,?,?)'
    elif dbpath == 'bilibili.db':
        sql = 'replace into bilibili (num,href,title,UP) values (?,?,?,?)'
    else:
        raise ValueError(f'unknown database file: {dbpath}')
    for row in rows:
        # Parameterized queries replace the old manual quoting, so values
        # containing quote characters no longer break the SQL.
        cur.execute(sql, row)
    conn.commit()
    cur.close()
    conn.close()

def weibo(weiboUrl):
    """Scrape the top six Weibo hot-search entries as [num, info, href, spot] rows."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36 Edg/94.0.992.50',
        'Host':'s.weibo.com',
        'Cookie':'SINAGLOBAL=4435446268469.208.1609664272951; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WF4UWi41iI4GDPBa6GgZFZx5JpX5KMhUgL.FoMXSK2ReKqRSKB2dJLoIE-LxKqL122LBo2LxKqL122LBo2LxKML12zLB.eLxK-LBoBLB.xy; UOR=,,www.baidu.com; ALF=1667560015; SSOLoginState=1636024016; SCF=Ao0Qf2t3Yfn9GK1VfNg5-2r9kERJPIc-daOhF5J4LexM6jfQzt5Kcj7JAYugXGB7JDI4W3sb-sUYZJY5yn--ERI.; SUB=_2A25Mh86ADeRhGeFK7lMZ8SjEzjiIHXVv9KdIrDV8PUNbmtAKLVDzkW9NQ1qYCpQk7EGD3LFHOZuWS4ZLhxg5CJsB; _s_tentry=login.sina.com.cn; Apache=1400543527553.1685.1636024017310; ULV=1636024017316:25:3:4:1400543527553.1685.1636024017310:1635922688090'
        }
    res = requests.get(weiboUrl, headers=headers).text
    html = etree.HTML(res)
    items = html.xpath('//*[@id="pl_top_realtimehot"]/table/tbody/tr')
    all = []
    # items[1:7] skips the first table row and keeps the next six;
    # rows whose rank cell is '•' instead of a number are skipped.
    for item in items[1:7]:
        if item.xpath('.//td[1]/text()') != ['•']:
            nums = item.xpath('.//td[1]/text()')
            infos = item.xpath('.//td[2]/a/text()')
            hrefs = item.xpath('.//td[2]/a/@href')
            spots = item.xpath('.//td[2]/span/text()')
            for i in range(len(nums)):
                num = nums[i]
                info = infos[i]
                href = hrefs[i]
                spot = spots[i]
            all.append([num, info, href, spot])
    return all

def bilibili(bilibiliUrl):
    """Scrape the first six videos of Bilibili's popular feed as [num, href, title, UP] rows."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.30',
        'Cookie':"_uuid=FB826D72-0DD4-6529-66C0-B41DA61A7E5159617infoc; buvid3=C44CFAFD-DABB-4176-B4A4-72C3089AF23C143085infoc; sid=blhheuhy; rpdid=|(JYYJYY|)JR0J'uY|Rmmlmm~; LIVE_BUVID=AUTO7316053400914985; fingerprint3=06575671fef0d32400ca8fc17ec0c1a1; buivd_fp=C44CFAFD-DABB-4176-B4A4-72C3089AF23C143085infoc; buvid_fp=C44CFAFD-DABB-4176-B4A4-72C3089AF23C143085infoc; blackside_state=1; fingerprint_s=4e211682af48f357f684155aad630a52; DedeUserID=174976202; DedeUserID__ckMd5=e0601685fccfe74f; CURRENT_BLACKGAP=1; CURRENT_QUALITY=64; fingerprint=9276324110ba055e6f9e7391e8e845b4; buvid_fp_plain=87CE8CFB-FFAF-40F5-B7EC-8E8A8D73C6A0167631infoc; SESSDATA=122bc1da,1647848602,f8e50*91; bili_jct=cda1b5d8bed4824c7bdb2f00518d5531; CURRENT_FNVAL=976; PVID=1; bp_video_offset_174976202=584236026094520654; bp_t_offset_174976202=584640998555479923; bsource=search_baidu; innersign=0",
        'referer':'https://www.bilibili.com/'
    }
    res = requests.get(bilibiliUrl, headers=headers).text
    html = etree.HTML(res)
    items = html.xpath('//*[@id="app"]/div/div[2]/div[1]/div[2]/div[1]/div[position()>0 and position()<7]')
    all = []
    j = 1        # running row number, used as the primary key in the database
    for item in items[0:6]:             
        hrefs = item.xpath('.//div/a/@href')
        titles = item.xpath('.//div/a/img/@alt')
        UPs = item.xpath('.//div/a/div/p[2]/text()')
        for i in range(len(hrefs)):
            num = str(j)
            j+=1
            href = hrefs[i]
            title = titles[i]
            UP = UPs[i]        
        all.append([num,href,title,UP])    
    return all



def picture():
    """Download six wallpapers from 3gbizhi.com into static/img/."""
    headers = {
        'Cookie': 'Hm_lvt_c8263f264e5db13b29b03baeb1840f60=1632291839,1632373348; Hm_lpvt_c8263f264e5db13b29b03baeb1840f60=1632373697',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'}
    url = 'https://www.3gbizhi.com/tupian/fjtp.html'
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    href_list = html.xpath('//div[@class="contlistw mtw"]//ul[@class="cl"]/li/a/@href')
    os.makedirs('static/img', exist_ok=True)      # make sure the target folder exists
    for href in href_list:
        res = requests.get(href, headers=headers)
        html_data = etree.HTML(res.text)
        img_url_list = html_data.xpath('//div[@class="picimglist pos"]/ul/li/a/img/@src')
        num = 0
        for img_url in img_url_list:
            # Drop the 'thumb_200_0_' prefix to request the full-size image.
            img_url = ''.join(img_url.split('thumb_200_0_'))
            result = requests.get(img_url, headers=headers).content
            with open('static/img/' + str(num) + '.jpg', 'wb') as f:
                f.write(result)
            num += 1
            print(f'Downloading image {num}...')
            if num == 6:      # only six wallpapers are needed
                break
        break                 # only the first album is needed



if __name__ == "__main__":
    weiboUrl = 'https://s.weibo.com/top/summary'
    dbpath1 = 'homepage.db'
    all1 = weibo(weiboUrl)
    saveDB(all1,dbpath1)

    bilibiliUrl = 'https://www.bilibili.com/'
    dbpath2 = 'bilibili.db'
    all2 = bilibili(bilibiliUrl)
    saveDB(all2,dbpath2)

    picture()
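
Since the whole point of the scraper is to hand data to a front-end project, here is a minimal sketch of how that project could read the stored rows back out of SQLite. It assumes a Flask app; the route name /api/hotsearch is my own placeholder and not part of the original project, while the table and column names match the schema above.

import sqlite3

from flask import Flask, jsonify

app = Flask(__name__)

@app.route('/api/hotsearch')            # hypothetical endpoint name
def hotsearch():
    # Read back the six hot-search rows that the scraper wrote into homepage.db.
    conn = sqlite3.connect('homepage.db')
    rows = conn.execute('select num, info, href, spot from homepage order by num').fetchall()
    conn.close()
    return jsonify([
        {'num': num, 'info': info, 'href': href, 'spot': spot}
        for num, info, href, spot in rows
    ])

if __name__ == '__main__':
    app.run(debug=True)

The bilibili.db table can be exposed the same way, so the front end only talks to these endpoints and never runs the scraper directly.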
