1、PDF转Word工具
from pdf2docx import Converter

# Convert a PDF file to a Word document with pdf2docx.
cv = Converter(r"E:/AAA.PDF")
try:
    # start/end select the page range; end=None means "through the last page".
    cv.convert("E:/AAA.docx", start=0, end=None)
finally:
    # Always release the underlying PDF handle, even if conversion fails.
    cv.close()
2、百度图片批量下载【可爱猫咪】
import requests
import json
import os
import io
from PIL import Image
import random
import string
# Configuration for the Baidu image-search JSON endpoint.
api_url = 'https://image.baidu.com/search/acjson'
query_word = '可爱猫咪' # search keyword ("cute cat")
page_num = 1 # which results page to fetch (1-based)
num_per_page = 30 # number of results per page
def get_random_string(length):
    """Return a random string of `length` lowercase ASCII letters."""
    alphabet = string.ascii_lowercase
    chars = []
    for _ in range(length):
        # One independent draw per output character.
        chars.append(random.choice(alphabet))
    return ''.join(chars)
# Ensure the download directory (named after the search keyword) exists.
if not os.path.exists(query_word):
    os.mkdir(query_word)
    print(f'已创建目录:{query_word}')
# Image save path, derived from the keyword so the two can never drift apart
# (the original hard-coded '可爱猫咪' here a second time).
output_dir = './' + query_word + '/'
# Query-string parameters for the Baidu image-search JSON endpoint.
# The empty/magic fields appear to be required verbatim by the API —
# TODO confirm which are actually mandatory.
params = {
'tn': 'resultjson_com',
'ipn': 'rj',
'ct': 201326592,
'is': '',
'fp': 'result',
'queryWord': query_word,  # search keyword
'cl': 2,
'lm': -1,
'ie': 'utf-8',  # request encoding
'oe': 'utf-8',  # response encoding
'adpicid': '',
'st': -1,
'z': '',
'ic': 0,
'word': query_word,  # search keyword again (the API expects both fields)
's': '',
'se': '',
'tab': '',
'width': '',
'height': '',
'face': 0,
'istype': 2,
'qc': '',
'nc': 1,
'fr': '',
'pn': str((page_num - 1) * num_per_page),  # result offset (0-based)
'rn': str(num_per_page),  # number of results to return
'gsm': '1e',
}
headers = {
    'Referer': 'https://image.baidu.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
}
# Query the search API; the timeout keeps a stalled connection from hanging
# the script forever.
response = requests.get(api_url, params=params, headers=headers, timeout=10)
if response.status_code != 200:
    print('请求失败')
else:
    json_data = response.json()  # parse the JSON payload directly
    for data in json_data['data']:
        # The trailing entry of the result list carries no thumbnail; skip it.
        img_url = data.get('thumbURL')
        if img_url is None:
            continue
        try:
            # Use a distinct name so the API response above is not clobbered.
            img_response = requests.get(img_url, timeout=10)
            img = Image.open(io.BytesIO(img_response.content))
            # Random file name avoids collisions between downloads.
            img_name = get_random_string(10)
            print(img_name)
            img.save(os.path.join(output_dir, img_name + '.jpg'))
            print(f'{img_name} 保存成功')
        except Exception as e:
            # Best-effort download: report the failure and continue.
            print(f'保存失败:{e}')
3、获取热点内容
import requests
from bs4 import BeautifulSoup
# Scrape the trending-topics table from tophub.today and print each row.
url = 'https://tophub.today/n/Jb0vmloB1G'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
# Fetch the page; the timeout prevents a dead connection from hanging.
response = requests.get(url, headers=headers, timeout=10)
html = response.text
# Parse the page: the first <tbody> holds the ranking rows.
soup = BeautifulSoup(html, 'html.parser')
hot_list = soup.find('tbody')
sub_hot_list = hot_list.find_all('tr')
for hot in sub_hot_list:
    cells = hot.find_all('td')
    # Rank number (first cell).
    serial = cells[0].text.strip()
    # Headline title.
    title = hot.find('td', class_='al').text.strip()
    # Absolute link to the topic page.
    href = 'https://tophub.today' + hot.find('a')['href']
    # View/popularity count (third cell) — the original comment mislabeled
    # this as "link".
    view = cells[2].get_text()
    print(serial, title, href, view)
4、获取豆瓣电影
import requests
from bs4 import BeautifulSoup
# Scrape the Douban movie chart and print title, rating, and link per movie.
url = 'https://movie.douban.com/chart'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
response = requests.get(url, headers=headers, timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')
items = soup.find_all('div', class_='pl2')
for item in items:
    title = item.find('a').text.strip()  # movie title
    link = item.find('a')['href']  # detail-page URL
    # Some entries carry no rating span; guard against a missing tag instead
    # of crashing with AttributeError on .text.
    rating_tag = item.find('span', class_='rating_nums')
    rating = rating_tag.text.strip() if rating_tag is not None else ''
    print(title + ' ' + rating + ' ' + link)
5、获取豆瓣电影保存至Excel
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
# Scrape the Douban movie chart and save name/rating/link rows to an Excel file.
url = 'https://movie.douban.com/chart'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
response = requests.get(url, headers=headers, timeout=10)
html = response.text
# Parse the chart page; each movie sits in a <div class="pl2">.
soup = BeautifulSoup(html, 'html.parser')
movie_list = soup.find_all('div', class_='pl2')
data = []
for movie in movie_list:
    # Movie title.
    name = movie.find('a').text.strip()
    # Some entries carry no rating span; guard against a missing tag instead
    # of crashing with AttributeError on .text.
    rating_tag = movie.find('span', class_='rating_nums')
    rating = rating_tag.text.strip() if rating_tag is not None else ''
    # Detail-page URL.
    href = movie.find('a')['href']
    data.append([name, rating, href])
# Write the collected rows to an .xlsx workbook.
wb = Workbook()
ws = wb.active
ws.title = '豆瓣电影榜单'
# Header row.
ws.append(['电影名称', '评分', '链接'])
for row in data:
    ws.append(row)
wb.save('豆瓣电影榜单.xlsx')
print('豆瓣电影榜单已保存到豆瓣电影榜单.xlsx')
6、下载视频
import requests
# Download a (watermarked) short video to video.mp4.
url = 'https://aweme.snssdk.com/aweme/v1/playwm/?video_id=视频ID&line=0'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299'
}
# Stream the body in chunks instead of buffering the whole video in memory.
response = requests.get(url, headers=headers, stream=True, timeout=30)
# Fail loudly on HTTP errors instead of silently saving an error page.
response.raise_for_status()
with open('video.mp4', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        f.write(chunk)
7、获取微博热点新闻
import requests
from bs4 import BeautifulSoup
# Scrape a Weibo mobile-search result list (keyword and date filters are
# pre-encoded in the URL; the page number is appended per request).
url = "https://weibo.cn/search/mblog?hideSearchFrame=&keyword=%E7%94%98%E8%82%83%E9%99%87%E5%8D%97&advancedfilter=1&hasori=1&starttime=20230530&endtime=20230530&sort=time&page="
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
}
for i in range(1, 20):  # pages 1-19 (the original comment wrongly claimed 20 pages)
    resp = requests.get(url + str(i), headers=headers, timeout=10)
    soup = BeautifulSoup(resp.content.decode('utf-8'), 'html.parser')
    contents = soup.find_all(class_="c")
    for content in contents:
        # Only blocks that embed a status id are actual posts; skip the rest.
        if 'id=' not in str(content):
            continue
        text = content.get_text().strip()
        # Guard: a block without an anchor would crash on ['href'].
        link_tag = content.find('a')
        if link_tag is None:
            continue
        href = link_tag['href']
        print(text, href)