# BeautifulSoup (bs4) -- basic usage
from bs4 import BeautifulSoup
from bs4.element import Tag
# Practice a lot: with Python, usage is how the knowledge becomes your own.
# Sample HTML document used to exercise BeautifulSoup below.  It deliberately
# mixes real tags (<li>, <ul>, <div>) with made-up ones (<list>, <lists>, <pl>,
# <zl>), duplicate ids, and HTML comments as tag content, so the various
# find/select examples have something interesting to match.
data = '''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>soup测试</title>
<title class="warm" id="hello" href="http://www.google.com/">你那温情的一笑,搞得我瑟瑟发抖</title>
</head>
<body>
<div class="tang">
<ul>
<li class="hello" id="world"><a id="world" href="http://www.baidu.com" title="出塞"><!--秦时明月汉时关,万里长征人未还,但使龙城飞将在,不教胡马度阴山--></a></li>
<list><a href="https://www.baidu.com" title="出塞" style="font-weight: bold"><!--秦时明月汉时关,万里长征人未还,但使龙城飞将在,不教胡马度阴山--></a></list>
<li><ul href="http://www.163.com" class="taohua" title="huahua">人面不知何处去,桃花依旧笑春风</ul></li>
<lists class="hello"><a href="http://mi.com" id="hong" title="huahua">去年今日此门中,人面桃花相映红</a></lists>
<li id="wo"><div href="http://qq.com" name="he" id="gu">故人西辞黄鹤楼,烟花三月下扬州</div></li>
</ul>
<ul>
<li class="hello" id="sf"><a href="http://www.baidu.com" title="出塞"><!--秦时明月汉时关,万里长征人未还,但使龙城飞将在,不教胡马度阴山--></a></li>
<list><a href="https://www.baidu.com" title="出塞"><!--秦时明月汉时关,万里长征人未还,但使龙城飞将在,不教胡马度阴山--></a></list>
<li><a href="http://www.163.com" class="taohua">人面不知何处去,桃花依旧笑春风</a></li>
<lists class="hello"><a href="http://mi.com" id="fhsf">去年今日此门中,人面桃花相映红,不知桃花何处去,出门依旧笑楚风</a></lists>
<li id="fs"><a href="http://qq.com" name="he" id="gufds">故人西辞黄鹤楼,烟花三月下扬州</a></li>
</ul>
</div>
<div id="meng">
<p class="jiang">
<span>三国猛将</span>
<ol>
<pl>关羽</pl>
<li>张飞</li>
<li>赵云</li>
<zl>马超</zl>
<li>黄忠</li>
</ol>
<div class="cao" id="h2">
<ul>
<li>典韦</li>
<li>许褚</li>
<li>张辽</li>
<li>张郃</li>
<li>于禁</li>
<li>夏侯惇</li>
</ul>
</div>
</p>
</div>
</body>
</html>'''
if __name__ == '__main__':
    # Second argument selects the parser; 'lxml' is the same engine used
    # for XPath processing.
    soup = BeautifulSoup(data, 'lxml')

    # Attribute-style access returns the FIRST matching tag (bs4.element.Tag).
    first_title = soup.title
    print(first_title)
    # print(type(first_title))

    # .string of a tag is a bs4.element.NavigableString.
    title_text = soup.title.string
    print(title_text)
    # print(type(title_text))

    # When a tag's only content is an HTML comment, .string is a
    # bs4.element.Comment.
    li_text = soup.li.string
    print(li_text)
    # print(type(li_text))

    # Direct children of the first <ul> inside the first <div> under <body>.
    # NOTE(review): .children yields direct children, not grandchildren.
    first_ul = soup.body.div.ul
    for child in first_ul.children:
        print(child.string)

    # --- Searching the document tree ---
    # print(soup.find(name='li', id='wo'))
    # print(soup.find(name='li', attrs={'class': 'hello', 'id': 'world'}))
    # print(soup.find_all('li', class_='hello'))

    # --- CSS selector syntax via soup.select() ---
    # print(soup.select('li#wo'))              # tag + id
    # print(soup.select('lists.hello a'))      # tag.class, then descendant
    # print(soup.select('.hello'))             # by class name
    # print(soup.select('#sf'))                # by id
    # print(soup.select('li#world'))           # combined lookup
    # print(soup.select('li #world'))          # space = descendant
    # print(soup.select('li a'))
    # print(soup.select('div > ol > pl,zl'))
    # --- Attribute selectors ---
    # print(soup.select('div[class="cao"][id="h2"] > ul > li'))
    # print(soup.select('div[class="cao"][id="h2"] > ul > li')[1])

    # --- Inspecting a Tag object ---
    # tag = soup.find_all(name='title')[1]
    # print(tag)
    # print(tag.name)
    # print(tag.attrs)
    # print(tag.attrs['href'])
    # print(tag.string)
    # print(tag['href'])
    # print(tag.get_text())
# jsonpath -- basic usage
import jsonpath
import json
# Sample JSON document: a store holding a list of books plus one bicycle.
data = '''{ "store": {
"book": [
{ "category": "reference",
"author": "李白",
"title": "Sayings of the Century",
"price": 8.95
},
{ "category": "fiction",
"author": "杜甫",
"title": "Sword of Honour",
"price": 12.99
},
{ "category": "fiction",
"author": "白居易",
"title": "Moby Dick",
"isbn": "0-553-21311-3",
"price": 8.99
},
{ "category": "fiction",
"author": "苏轼",
"title": "The Lord of the Rings",
"isbn": "0-395-19395-8",
"price": 22.99
}
],
"bicycle": {
"color": "red",
"price": 19.95
}
}
}'''
# BUG FIX: json.loads() no longer accepts an `encoding` argument (ignored
# since Python 3.1, removed in 3.9); the input is already a str.
json_obj = json.loads(data)
# Authors of every book in the store.
# print(jsonpath.jsonpath(json_obj, '$.store.book[*].author'))
# Recursive descent: every "author" field anywhere in the document.
# print(jsonpath.jsonpath(json_obj, '$..author'))
# Filter expression: books priced above 12.
# print(jsonpath.jsonpath(json_obj, '$.store.book[?(@.price>12)]'))
# jsonpath indexing starts at 0.
# print(jsonpath.jsonpath(json_obj, '$.store.book[0]'))
# @ is the current object; length - 1 addresses the last element.
# print(jsonpath.jsonpath(json_obj, '$.store.book[(@.length -1)]'))
# Books that carry an "isbn" key.
print(jsonpath.jsonpath(json_obj, '$.store.book[?(@.isbn)]'))
# Selenium -- basic usage
# Imports:
import time

from selenium import webdriver
# Needed for the modern locator API used below.
from selenium.webdriver.common.by import By

url = 'http://www.qq.com/'

if __name__ == '__main__':
    driver = webdriver.Chrome()           # launch a Chrome browser
    driver.get(url)                       # navigate to the URL
    time.sleep(2)
    # driver.maximize_window()            # maximize the window
    # BUG FIX: the original passed undefined names x, y -- use concrete values.
    driver.set_window_size(1024, 768)     # set the browser window size
    time.sleep(2)
    driver.save_screenshot('./qq.png')    # save a screenshot named qq.png
    # Find the search box and type a query, then click the search button.
    # BUG FIX: find_element_by_* was removed in Selenium 4; use find_element(By...).
    driver.find_element(By.CLASS_NAME, 's_ipt').send_keys('macbook pro')
    driver.find_element(By.ID, 'su').click()   # .click() presses the button
    # BUG FIX: close() moved AFTER the element interactions -- the original
    # closed the session first, so the subsequent lookups would raise.
    driver.close()
# Fetch Zhilian (zhaopin.com) job-listing data
driver = webdriver.Chrome()
driver.implicitly_wait(30)   # wait up to 30 s when locating any element
driver.get(url)
time.sleep(5)
# Dismiss the risk-warning dialog that covers the page on load.
driver.find_element_by_xpath('//div[@class="risk-warning__content"]/button').click()
# page_source is the full rendered HTML of the current page.
with open('./zhilian.html', 'w', encoding='utf-8') as fp:
    fp.write(driver.page_source)
driver.back()      # browser history: go back
driver.forward()   # browser history: go forward
# When more than one tab/window is open, switch between them via window handles.
windows = driver.window_handles
print(windows)
# BUG FIX: _switch_to is a private attribute; the public API is driver.switch_to.
# Focus the second window so subsequent actions target it.
driver.switch_to.window(windows[1])
# Selenium + JavaScript execution
# execute_script runs arbitrary JS in the page; arguments[i] on the JS side
# maps to the extra Python arguments.
# BUG FIX: the original called execute_script() bare (no driver) with an
# undefined `img` -- kept here only as a commented example:
# driver.execute_script('$(arguments[0]).fadeOut()', img)
driver.execute_script("var q = document.getElementById(\"kw\");"
                      "q.style.border=\"2px solid red\";")
# Some login pages show phone-number login by default; the account/password
# form lives in an <iframe>, which is an independent document -- switch into
# it before locating its elements.
iframe = driver.find_element_by_xpath('//div[@class="login"]/iframe')
print('------------', iframe)
# BUG FIX: use the public switch_to API instead of the private _switch_to.
driver.switch_to.frame(iframe)
account = driver.find_element_by_class_name('account-tab-account')
print('+++++++++++', account)
account.click()
# Using the tesseract library (via pytesseract)
# Recognizes the text content of an image
from PIL import Image
import pytesseract

if __name__ == '__main__':
    # FIX: Image.open is lazy and keeps the file handle open; the context
    # manager closes it deterministically (resource leak in the original).
    with Image.open('./chi.png') as image:
        # result = pytesseract.image_to_data(image)   # word boxes + confidences
        result = pytesseract.image_to_string(image)    # plain OCR text
    print(result)
# Automatically log in to a website, OCR-ing the captcha image to fill the form.
from selenium import webdriver
import pytesseract
from PIL import Image
import time
# Login page of gushiwen.org (classical-poetry site).
url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'
def get_captcha():
    """Open the login page, save the captcha image to ./captcha.png, and
    return the live WebDriver so the caller can finish the login."""
    driver = webdriver.Chrome()
    driver.maximize_window()
    driver.implicitly_wait(10)   # wait up to 10 s when locating elements
    driver.get(url)
    # Screenshot the captcha <img> element directly -- no need to crop a
    # region out of a full-page screenshot as older code did.
    img = driver.find_element_by_id('imgCode')
    img.screenshot('./captcha.png')
    # BUG FIX: the original returned the undefined name `drive` (NameError);
    # the local variable is `driver`.
    return driver
def recognize_captcha():
    """Binarize ./captcha.png and OCR it; return the recognized text."""
    # FIX: context manager closes the image file handle (leak in original).
    with Image.open('./captcha.png') as captcha:
        gray = captcha.convert('L')   # 8-bit grayscale: 0 = black, 255 = white
    # Threshold at 140: darker pixels become pure black, the rest pure white.
    # Image.point applies the mapping in a single C-level pass instead of the
    # original per-pixel Python loop (same resulting pixels).
    binary = gray.point(lambda p: 0 if p < 140 else 255)
    return pytesseract.image_to_string(binary)
def login(drive, code):
    """Fill in the account credentials plus the captcha text and submit.

    NOTE(review): credentials are hard-coded in source -- move them to
    environment variables or a config file.
    """
    drive.find_element_by_id('email').send_keys('455098435@qq.com')
    drive.find_element_by_id('pwd').send_keys('31415926abc')
    drive.find_element_by_id('code').send_keys(code)
    time.sleep(1)   # brief pause before submitting
    drive.find_element_by_id('denglu').click()
if __name__ == '__main__':
    drive = get_captcha()
    # The OCR result may be wrong, in which case the login attempt fails.
    code = recognize_captcha()
    print('----------------', code)
    login(drive, code)