python进阶宝典9-操控web（爬虫与web测试等）

本文链接：https://blog.csdn.net/ebzxw/article/details/80215377

本文介绍如何使用Python中的webbrowser、requests、BeautifulSoup及selenium等库实现自动化网页浏览、内容抓取与交互。通过这些工具可以完成网页内容的下载、解析，甚至模拟浏览器行为来填写表单和点击链接。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

#! python3
## Open a page with webbrowser
# Launches a map in the browser using an address from the command line or clipboard.
import webbrowser
import sys
import pyperclip
from urllib.parse import quote

if len(sys.argv) > 1:
    # Get address from command line.
    address = ' '.join(sys.argv[1:])
else:
    # Get address from clipboard.
    address = pyperclip.paste()
# Strip stray whitespace (clipboard text often ends with a newline) and
# percent-encode the address so characters such as '#', '?' or non-ASCII
# text cannot truncate or corrupt the URL path.
webbrowser.open('https://www.google.com/maps/place/' + quote(address.strip()))

## Download from the web with the requests module
# A simpler replacement for Python's urllib2 module, which is cumbersome to use.
# pip install requests
import requests
# Without a timeout requests waits indefinitely for an unresponsive server.
res = requests.get('http://www.gutenberg.org/cache/epub/1112/pg1112.txt', timeout=30)
# NOTE: the bare expressions below only display a value in the interactive
# shell; when run as a script they are evaluated and the result is discarded.
type(res)
res.status_code == requests.codes.ok
print(res.status_code)
len(res.text)
print(res.text[:250])

## Check that a download succeeded: calling raise_for_status() on the
## Response object raises an exception if the download failed.
# Always call raise_for_status() after requests.get().
# Use try/except to handle the error more gracefully.
import requests
# Without a timeout requests waits indefinitely for an unresponsive server.
res = requests.get('http://www.abc.com/page_lost', timeout=30)
try:
    res.raise_for_status()
except requests.exceptions.HTTPError as exc:
    # HTTPError is what raise_for_status() raises for 4xx/5xx responses;
    # catching only it keeps unrelated bugs from being silently swallowed.
    print('There was a problem: %s' % (exc))

## Download a file and save it to disk
import requests
# Without a timeout requests waits indefinitely for an unresponsive server.
res = requests.get('http://www.gutenberg.org/cache/epub/1112/pg1112.txt', timeout=30)
res.raise_for_status()
# Open in binary-write ('wb') mode so the bytes are stored exactly as
# received, which preserves the text's Unicode encoding.
# The with-statement closes the file even if a write fails part-way.
with open('pg_get.txt','wb') as tfile:
    # iter_content() hands back the body a chunk at a time (here up to
    # 100000 bytes per iteration) instead of one large string.
    for chunk in res.iter_content(100000):
        tfile.write(chunk)

## Parse HTML with the BeautifulSoup module
# pip install beautifulsoup4
import requests,bs4
# Without a timeout requests waits indefinitely for an unresponsive server.
res = requests.get('http://nostarch.com', timeout=30)
res.raise_for_status()
# Name the parser explicitly: otherwise bs4 picks whichever parser happens to
# be installed (and warns), so the parse tree can differ between machines.
nosoup = bs4.BeautifulSoup(res.text, 'html.parser')
# NOTE: a bare expression only displays its value in the interactive shell.
type(nosoup)
print(nosoup)

## Parse a local HTML file and look elements up with CSS selectors
# A local HTML file can be parsed directly; the with-statement closes the
# file as soon as its contents have been read.
with open('example.html') as efile:
    # Explicit parser: avoids bs4 guessing one based on what is installed.
    nosoup = bs4.BeautifulSoup(efile.read(), 'html.parser')
# NOTE: the bare expressions in this section only display a value in the
# interactive shell; run as a script they are evaluated and discarded.
elems = nosoup.select('#author')
type(elems)
len(elems)
type(elems[0])
elems[0].getText()
str(elems[0])
elems[0].attrs

pelems = nosoup.select('p')
str(pelems[0])
pelems[0].getText()
str(pelems[1])
pelems[1].getText()

selems = nosoup.select('span')[0]
str(selems)
selems.get('id')
selems.attrs

## Open a page, then automatically open up to 5 of the links found on it
import requests,sys,webbrowser,bs4
from urllib.parse import urljoin

print('Ebscn...')
# Without a timeout requests waits indefinitely for an unresponsive server.
res = requests.get('http://www.ebscn.com/', timeout=30)
res.raise_for_status()
len(res.text)  # NOTE: only displays a value in the interactive shell.
# Parse the page; the parser is named explicitly so bs4 does not guess one.
soup = bs4.BeautifulSoup(res.text, 'html.parser')
print(soup)
# Open a browser tab for each of the first (at most five) links inside <li> items.
linkElems = soup.select('li a')
numOpen = min(5,len(linkElems))
for link in linkElems[:numOpen]:
    href = link.get('href')
    if not href:
        # An <a> without an href has nothing to open.
        continue
    # Resolve site-relative links against the URL the page was served from.
    webbrowser.open(urljoin(res.url, href))

## Download comics from a web-comic site
# 1. Download the page with the requests module.
# 2. Use Beautiful Soup to find the URL of the comic image in the page.
# 3. Download the comic image with iter_content() and save it to disk.
# 4. Find the URL of the previous comic's link, then repeat.
import requests,os,bs4
from urllib.parse import urljoin

url = 'http://xkcd.com' # starting url
os.makedirs('xkcd',exist_ok=True) # store comics in ./xkcd
# The Prev link on the first comic is expected to be '#', which ends the loop.
while not url.endswith('#'):
    # Download the page.
    print('Downlaoding page %s...' % url)
    # Without a timeout requests waits indefinitely for an unresponsive server.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    # Find the URL of the comic image.
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    comicElem = soup.select('#comic img')
    if not comicElem:
        print('Could not find comic image.')
    else:
        # The src was previously prefixed with 'http:', i.e. assumed to be
        # protocol-relative ('//host/path'). urljoin gives the same result
        # for that form and also copes with absolute or site-relative src.
        comicUrl = urljoin(url, comicElem[0].get('src'))
        # Download the image.
        print('Downloading image %s...' % (comicUrl))
        res = requests.get(comicUrl, timeout=30)
        res.raise_for_status()
        # Save the image to ./xkcd; the with-statement closes the file even
        # if a write fails part-way.
        with open(os.path.join('xkcd',os.path.basename(comicUrl)),'wb') as imageFile:
            for chunk in res.iter_content(100000):
                imageFile.write(chunk)
    # Get the Prev button's url.
    prevLinks = soup.select('a[rel="prev"]')
    if not prevLinks:
        # No Prev link on this page: stop cleanly instead of raising IndexError.
        print('Could not find Prev link.')
        break
    prevLink = prevLinks[0]
    # Plain concatenation is kept on purpose: the loop condition relies on a
    # trailing '#', which urljoin() would drop.
    url = 'http://xkcd.com' + prevLink.get('href')
print('Done.')

## Control the browser with the selenium module
# browser.find_element(By.<STRATEGY>, value) returns a WebElement.
# Locator strategies:
#   By.CLASS_NAME         elements that use the given CSS class name
#   By.CSS_SELECTOR       elements that match the given CSS selector
#   By.ID                 elements whose id attribute matches
#   By.LINK_TEXT          <a> elements whose text matches exactly
#   By.PARTIAL_LINK_TEXT  <a> elements whose text contains the given text
#   By.NAME               elements whose name attribute matches
#   By.TAG_NAME           elements with the given tag name
# find_element() returns a WebElement for the first match on the page.
# browser.find_elements(By.<STRATEGY>, value) returns all matching elements.
# The returned WebElement objects have attributes and methods of their own.
# (The older find_element_by_* helpers were removed in Selenium 4.3;
# the By-based form works on old and new releases alike.)
#
# pip install selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
# Other drivers can be used instead: webdriver.Firefox(), webdriver.Ie()
browser = webdriver.Chrome()
browser.get('http://inventwithpython.com')
try:
    elem = browser.find_element(By.CLASS_NAME, 'nav-link')
except NoSuchElementException:
    # Only the "no such element" case is handled here; any other failure
    # (driver crash, typo in the code) should surface, not be hidden.
    print('Was not able to find an element with that name.')
else:
    print('Found <%s> element with that class name!' % (elem.tag_name))
# Click a link on the page.
linkElem = browser.find_element(By.LINK_TEXT, 'Read It Online')
type(linkElem)  # NOTE: only displays a value in the interactive shell.
linkElem.click()

## Fill in and submit a form
from selenium import webdriver
from selenium.webdriver.common.by import By
browser = webdriver.Chrome()
browser.get('http://gmail.com')
# find_element(By.ID, ...) replaces find_element_by_id, which was removed in
# Selenium 4.3.
emailElem = browser.find_element(By.ID, 'Email') # locate the page element
emailElem.send_keys('not_my_real_email@gmail.com') # type into the field
passwdElem = browser.find_element(By.ID, 'Passwd')
passwdElem.send_keys('123456')
passwdElem.submit() # submit the form, same as pressing the form's submit button

## Send special keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
browser = webdriver.Chrome()
browser.get('http://nostarch.com')
# find_element(By.TAG_NAME, ...) replaces find_element_by_tag_name, which was
# removed in Selenium 4.3.
htmlElem = browser.find_element(By.TAG_NAME, 'html')
htmlElem.send_keys(Keys.END) # scrolls to bottom
htmlElem.send_keys(Keys.HOME) # scrolls to top

## Simulate the browser's buttons
# These calls act on the browser opened in the section above.
browser.back()
browser.forward()
browser.refresh()
browser.quit()

## selenium can also modify cookies, take page screenshots and run custom JavaScript -- it is very powerful
# For details see http://selenium-python.readthedocs.org