# Fetch a Baidu search-results page, print each result's title, and resolve
# each encrypted redirect link (baidu.com/link?url=...) to its real target
# URL by following the redirect with a GET request.
import requests
from bs4 import BeautifulSoup

# Query string is the UTF-8 percent-encoding of the search keyword ("昊天seo").
url = "http://www.baidu.com/s?ie=UTF-8&wd=%E6%98%8A%E5%A4%A9seo"
res = requests.get(url)
soup = BeautifulSoup(res.text, "lxml")

# Each organic search result is wrapped in an <h3 class="t"> node.
for z in soup.select('h3[class="t"]'):
    link = z.select('a')[0]         # first anchor holds title + redirect href
    print(link.get_text())          # result title
    # href is a Baidu-encrypted redirect; requests follows the 302 chain,
    # so the final .url is the real destination address.
    print(requests.get(link['href']).url)
# -*- coding: utf-8 -*-
"""
@Time: 2018/5/16
@Author: songhao
@WeChat Official Account: zeropython
@File: bdspider.py

Fetch a Baidu results page with a Scrapy Selector and resolve each result's
encrypted redirect link via a HEAD request: HEAD asks only for the response
headers, so the real URL is read from the 302 Location header without
downloading the target page body.
"""
from urllib.parse import quote

import requests
from scrapy import Selector

# The search keyword must be percent-encoded before it goes into the query string.
url1 = "http://www.baidu.com/s?wd={}".format(quote("雅昌"))
r = requests.get(url1)

selector = Selector(text=r.text, type="html")

# Organic results live in <h3> nodes whose class attribute contains "t".
for a in selector.xpath('//h3[contains(@class, "t")]'):
    # The title text may be split across nested tags; join every text node.
    print("".join(a.xpath('.//text()').extract()).strip())

    urlen = a.xpath('./a/@href').extract_first()
    print(urlen)

    # extract_first() returns None when the result block has no direct <a>
    # child; guard so requests.head(None) cannot raise.
    if urlen:
        # HEAD: request only the page headers. Baidu answers the encrypted
        # link with a redirect whose Location header is the real URL.
        print(requests.head(urlen).headers.get('Location'))
其中,获取嵌套标签内 text 的方法可以参考上面的代码。
# Two equivalent ways to resolve (decrypt) a Baidu redirect link
# (baidu.com/link?url=...) to the real destination URL.
import requests
import urllib.request

bd_url = "https://www.baidu.com/link?url=mhMx_W4kSIqeHdckh0dvrBt4LDIxvTrf1XqoDQKAptW&" \
         "ck=5341.10.0.0.0.203.232.0&shh=www.baidu.com&sht=baiduhome_pg&wd=&" \
         "eqid=cd90b17a00034b1c000000035a645fd5"

# Method 1: requests follows the redirect chain automatically; the final
# address is exposed as Response.url.
r = requests.get(bd_url)
print(r.url)

# Method 2: plain urllib (note: the urlopen import path differs between
# Python 3 and Python 2.7). Use a context manager so the HTTP response is
# closed deterministically instead of leaking the connection.
with urllib.request.urlopen(bd_url) as resp:
    print(resp.geturl())