# Fetch a Baidu search-results page, print each result's title, and resolve
# each encrypted redirect link (baidu.com/link?url=...) to its real target
# URL by following the redirect with a GET request.
import requests
from bs4 import BeautifulSoup

# Query string is the UTF-8 percent-encoding of the search keyword ("昊天seo").
url = "http://www.baidu.com/s?ie=UTF-8&wd=%E6%98%8A%E5%A4%A9seo"
res = requests.get(url)
soup = BeautifulSoup(res.text, "lxml")

# Each organic search result is wrapped in an <h3 class="t"> node.
for z in soup.select('h3[class="t"]'):
    link = z.select('a')[0]         # first anchor holds title + redirect href
    print(link.get_text())          # result title
    # href is a Baidu-encrypted redirect; requests follows the 302 chain,
    # so the final .url is the real destination address.
    print(requests.get(link['href']).url)
# -*- coding: utf-8 -*-
"""
@Time: 2018/5/16
@Author: songhao
@WeChat Official Account: zeropython
@File: bdspider.py

Fetch a Baidu results page with a Scrapy Selector and resolve each result's
encrypted redirect link via a HEAD request: HEAD asks only for the response
headers, so the real URL is read from the 302 Location header without
downloading the target page body.
"""
from urllib.parse import quote

import requests
from scrapy import Selector

# The search keyword must be percent-encoded before it goes into the query string.
url1 = "http://www.baidu.com/s?wd={}".format(quote("雅昌"))
r = requests.get(url1)

selector = Selector(text=r.text, type="html")

# Organic results live in <h3> nodes whose class attribute contains "t".
for a in selector.xpath('//h3[contains(@class, "t")]'):
    # The title text may be split across nested tags; join every text node.
    print("".join(a.xpath('.//text()').extract()).strip())

    urlen = a.xpath('./a/@href').extract_first()
    print(urlen)

    # extract_first() returns None when the result block has no direct <a>
    # child; guard so requests.head(None) cannot raise.
    if urlen:
        # HEAD: request only the page headers. Baidu answers the encrypted
        # link with a redirect whose Location header is the real URL.
        print(requests.head(urlen).headers.get('Location'))
其中,获取嵌套标签内 text 的方法可以参考上面的代码。
# Two equivalent ways to resolve (decrypt) a Baidu redirect link
# (baidu.com/link?url=...) to the real destination URL.
import requests
import urllib.request

bd_url = "https://www.baidu.com/link?url=mhMx_W4kSIqeHdckh0dvrBt4LDIxvTrf1XqoDQKAptW&" \
         "ck=5341.10.0.0.0.203.232.0&shh=www.baidu.com&sht=baiduhome_pg&wd=&" \
         "eqid=cd90b17a00034b1c000000035a645fd5"

# Method 1: requests follows the redirect chain automatically; the final
# address is exposed as Response.url.
r = requests.get(bd_url)
print(r.url)

# Method 2: plain urllib (note: the urlopen import path differs between
# Python 3 and Python 2.7). Use a context manager so the HTTP response is
# closed deterministically instead of leaking the connection.
with urllib.request.urlopen(bd_url) as resp:
    print(resp.geturl())