# spider_review
import requests
# Fetch the Douban Top 250 page through a shared Session so the custom
# headers (and any cookies the site sets) are reused by later requests.
session = requests.Session()
# NOTE(review): TLS certificate verification is switched off here. That leaves
# the connection open to man-in-the-middle tampering and makes urllib3 emit
# InsecureRequestWarning. Left unchanged because it looks deliberate; confirm
# whether it is still required and remove it if not.
session.verify = False
# Present a desktop-browser User-Agent instead of the default
# python-requests one.
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
})
# requests applies no timeout by default; bound the wait so an unresponsive
# server cannot hang the script indefinitely.
resp = session.get('https://movie.douban.com/top250', timeout=10)
# Each bare print() writes one empty line.
print()
print()
# 正则表达式的应用
import re
import openpyxl
import requests
from openpyxl.cell import Cell
from openpyxl.styles import Font, Alignment
# Scrape (title, link) pairs from the Sohu home page with a regular
# expression and write them to an Excel workbook.

# Match an opening <a ...> tag that carries an href followed by a title.
# `[^>]` keeps the whole match inside one tag and `[^"]` keeps each capture
# inside one attribute value, so the href of one anchor can never be paired
# with the title of a later tag on the same line.
# Group 'foo' captures the href value, group 'bar' the title value.
pattern = re.compile(
    r'<a\s[^>]*?href="(?P<foo>[^"]*)"[^>]*?title="(?P<bar>[^"]*)"[^>]*>'
)
# requests applies no timeout by default; bound the wait so an unresponsive
# server cannot hang the script indefinitely.
resp = requests.get('https://www.sohu.com/', timeout=10)
iter_obj = pattern.finditer(resp.text)

# Set up the worksheet: name, header-row height and column widths.
wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = '搜狐新闻'
sheet.row_dimensions[1].height = 35
sheet.column_dimensions['A'].width = 80
sheet.column_dimensions['B'].width = 120

# Header row: column A holds the title text, column B the link.
sheet.cell(1, 1, '标题')
sheet.cell(1, 2, '链接')
font = Font(size=18, name='华文楷体', bold=True, color='ff0000')
alignment = Alignment(horizontal='center', vertical='center')
for col_index in 'AB':
    curr_cell = sheet[f'{col_index}1']
    curr_cell.font = font
    curr_cell.alignment = alignment

# Data rows start at row 2, directly below the header.
for row_index, matcher in enumerate(iter_obj, start=2):
    sheet.cell(row_index, 1, matcher.group('bar'))
    sheet.cell(row_index, 2, matcher.group('foo'))

# The target directory is not created here, so 'files/' has to exist already.
wb.save('files/爬虫数据文件.xlsx')
# 破解selenium反爬最重要的一行代码
from selenium import webdriver
# Drive Chrome with the `navigator.webdriver` flag hidden, inspect one link on
# the Baidu home page and save a screenshot of the page.
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
try:
    # Register a script that runs before any page script on every new
    # document, so `navigator.webdriver` reads as undefined to the page.
    browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
    })
    browser.get('https://www.baidu.com')
    # Element lookups below retry for up to 10 seconds before failing.
    browser.implicitly_wait(10)
    # find_element_by_css_selector was removed in Selenium 4.3; the
    # By-locator form works on both older and current releases.
    anchor = browser.find_element(By.CSS_SELECTOR, '#s-top-left > a:nth-child(7)')
    print(anchor.is_displayed())
    print(anchor.size)
    print(anchor.location)
    browser.get_screenshot_as_file('files/baidu.png')
finally:
    # Always shut down the browser and its driver process, even if a step
    # above raised.
    browser.quit()
# 光学文字识别
import easyocr
# Run OCR on the ID-card image and print what was recognised.
# 'ch_sim' and 'en' are EasyOCR's language codes for Simplified Chinese and
# English; gpu=False keeps recognition on the CPU.
ocr_languages = ['ch_sim', 'en']
reader = easyocr.Reader(ocr_languages, gpu=False)
# detail=0 requests EasyOCR's simple output form (recognised text only,
# per the EasyOCR documentation).
recognized_text = reader.readtext('files/idcard.jpg', detail=0)
print(recognized_text)
# 从页面上抠图
from PIL import Image as img
from PIL import ImageFilter
from PIL.Image import Image
# Preview an embossed version of the ID-card image, then crop one region of
# it and save that region as a separate file.
# Image.open keeps the underlying file open; the `with` block closes it once
# all work on the source image is finished.
with img.open('files/idcard.jpg') as image:
    print(image.size)
    # Show the embossed copy in the system's default image viewer.
    emboss_image = image.filter(ImageFilter.EMBOSS)
    emboss_image.show()
    # Crop box is (left, upper, right, lower) in pixels.
    # NOTE(review): the coordinates are hard-coded for this particular
    # picture; presumably they frame the portrait photo, so confirm before
    # reusing them with another image.
    head = image.crop((320, 50, 460, 235))
    head.save('files/head.jpg')