7. Batch-decoding Baidu search result URLs for a list of keywords

For each keyword read from key.txt, the script below fetches the Baidu results page, resolves each result's redirect link (https://www.baidu.com/link?url=...) back to the real target URL, and appends the keyword, title, URL, and rank to info.txt.

# coding: utf-8
import requests
from bs4 import BeautifulSoup
import re
import time

# Read one keyword per line from key.txt, skipping blank lines
with open('key.txt', 'r') as f:
    keys = [line.strip() for line in f if line.strip()]
key_words = list(enumerate(keys, start=1))


for key in key_words:
    # key is an (index, keyword) tuple; requests percent-encodes the
    # non-ASCII keyword in the query string automatically
    url = 'https://www.baidu.com/s?wd=' + key[1]

    # NOTE: the Cookie below is the author's own Baidu session; replace it
    # with your own, or Baidu may serve a verification page instead of results.
    header = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
        'Cookie':'PSTM=1476231684; BIDUPSID=4F526560482E2A5E68D69CC8B0998806; plus_cv=1::m:92e3c68f; BAIDUID=C5A710455602AEA5BEC3D1B13B26321B:FG=1;'
                 ' BDUSS=W5zS3JSeVYwSHZjVm5SdTdjQjlKNC1FLWJqbklvaEptZjVZVkl2bXhMN1o1amhZSVFBQUFBJCQAAAAAAAAAAAEAAACj2nZjanVleWluZ3MAAAAAAAAAAAAAAAAAAAA'
                 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANlZEVjZWRFYT; BD_HOME=1; BD_UPN=12314353; sug=3; sugstore=0; ORIGIN=2; bdime=0;'
                 ' H_PS_645EC=78d5XI4%2Bj6NkSjLKSmkiYdx%2F5jHNa0c4UemYz6WwEpyczIPebiQwaLtzwnXd2gUHv28P; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BD_CK_SAM=1;'
                 ' PSINO=6; H_PS_PSSID=1448_18288_21112_17001_20241_21455_21406_21394_21377_21192_20929; BDSVRTM=0'
    }

    web_db = requests.get(url, headers=header)
    time.sleep(2)  # throttle requests so Baidu does not block the crawler
    soup = BeautifulSoup(web_db.text, 'lxml')

    # Each organic result title is an <a> tag under #content_left;
    # enumerate() assigns each result its rank on the page
    titles = soup.select('#content_left > div > h3 > a')

    for rank, link in enumerate(titles, start=1):
        baidu_url = link.get('href')
        # Links of the form .../link?url=... are Baidu redirects that must be
        # resolved to the real target URL
        if 'link?url=' in str(baidu_url):
            web_db2 = requests.get(baidu_url, allow_redirects=False)
            if web_db2.status_code == 200:
                # Some redirects come back as a page whose <noscript> block
                # holds the target URL in quotes; extract it with a regex
                soup2 = BeautifulSoup(web_db2.text, 'lxml')
                noscript = soup2.select('head > noscript')[0]
                url_match = re.search(r'\'(.*?)\'', str(noscript), re.S)
                web_url = url_match.group(1)
            elif web_db2.status_code == 302:
                # A plain 302 carries the real URL in the Location header
                web_url = web_db2.headers['location']
            else:
                web_url = 'error'
        else:
            web_url = baidu_url


        data = {
            'key': key[1],
            'title': link.get_text(),
            'url': web_url,
            'rank': rank,
        }
        with open('info.txt', 'a') as f:
            f.write(str(data) + '\n')
    print('Finished collection task ' + str(key[0]) + ' of ' + str(len(key_words)))
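If only the redirect-decoding step is needed elsewhere, the same logic can be factored into a small standalone helper. This is a minimal sketch under the same assumptions as the script above (the function name decode_baidu_url is hypothetical; the 302 Location header is tried first since it is the common case):

import re
import requests
from bs4 import BeautifulSoup

def decode_baidu_url(baidu_url):
    # Return the real target of a https://www.baidu.com/link?url=... redirect,
    # or None if it cannot be resolved.
    if 'link?url=' not in str(baidu_url):
        return baidu_url  # already a direct link, nothing to decode
    resp = requests.get(baidu_url, allow_redirects=False)
    if resp.status_code == 302:
        # usual case: the target URL is carried in the Location header
        return resp.headers.get('location')
    if resp.status_code == 200:
        # fallback: the target URL sits quoted inside a <noscript> redirect page
        noscript = BeautifulSoup(resp.text, 'lxml').select_one('head > noscript')
        match = re.search(r"'(.*?)'", str(noscript), re.S)
        return match.group(1) if match else None
    return None

Calling decode_baidu_url() on each href taken from the results page would then replace the whole if/else block inside the main loop.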

