# 输入名字爬取百度搜索的网页源码(初级)
import requests

# Ask for a name, fetch the Baidu search-result page for it, and print the
# raw HTML.
# The prompt previously read "请输出" (please output); it asks for input.
name = input("请输入要搜索人的名称:")
url = "https://www.baidu.com/s"
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}
# The search term is passed through ``params`` so that the value typed by the
# user is actually sent (the old URL contained the literal text "wd=name") and
# is URL-encoded by requests.
# The ``with`` block closes the response even if reading the body fails.
with requests.get(url=url, params={"wd": name}, headers=head) as response:
    html = response.text
print(html)
# 获取百度翻译结果(初级)
import requests

# Read an English word from the user, POST it as the "kw" form field to the
# Baidu Translate suggestion endpoint, and print the decoded JSON reply.
word = input("请输入要翻译的英文单词:")
url = "https://fanyi.baidu.com/sug"
dat = dict(kw=word)
resp = requests.post(url, data=dat)
reply = resp.json()
print(reply)
resp.close()
# 爬取豆瓣电影排行榜第一页(初级)
import requests

# Request the Douban chart JSON endpoint and print the decoded reply.
# A browser-like User-Agent is sent with the request.
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}
# Query-string arguments; start/limit presumably select the first 20 entries
# (TODO confirm against the endpoint's behaviour).
param = dict(
    type="24",
    interval_id="100:90",
    action="",
    start="0",
    limit="20",
)
url = "https://movie.douban.com/j/chart/top_list"
resp = requests.get(url, headers=head, params=param)
payload = resp.json()
print(payload)
resp.close()
# 爬取豆瓣电影TOP250 名字,年份,评分,评价人数
import re
import requests
import csv

# Download the Douban Top 250 landing page, extract the name / year / score /
# number groups for each entry with a regular expression, and write one CSV
# row per match to data.csv.
# NOTE(review): only the single page at ``url`` is fetched; if the whole list
# is wanted, further pages would have to be requested as well.
url = "https://movie.douban.com/top250"
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}
# The context manager releases the connection even if reading the body fails.
with requests.get(url=url, headers=head) as rsp:
    html = rsp.text
# re.S lets ".*?" span line breaks, since each entry's markup covers many lines.
obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)'
                 r'</span>.*?<p class="">.*?<br>(?P<year>.*?) .*?'
                 r'<span class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
                 r'<span>(?P<number>.*?)</span>', re.S)
result = obj.finditer(html)
# newline="" is what the csv module requires to avoid blank rows on Windows.
# An explicit UTF-8 encoding keeps non-ASCII titles from failing under a
# narrower locale default.
with open("data.csv", mode="w", newline="", encoding="utf-8") as f:
    csvwriter = csv.writer(f)
    for it in result:
        dic = it.groupdict()
        # The year capture carries surrounding whitespace from the page markup.
        dic['year'] = dic['year'].strip()
        csvwriter.writerow(dic.values())
# 爬取百度图片:
import requests
import re
import os

# Search Baidu Images for a keyword, collect every "thumbURL" found in the
# result page, and save each thumbnail as <folder>/<keyword>_<index>.jpg.
word = input("请输入搜索关键词(可以是人名,地名等): ")
url = 'https://image.baidu.com/search/flip'
# The keyword goes through ``params`` so that characters such as "&", "#" or
# spaces are URL-encoded instead of corrupting the query string.
query = {
    'tn': 'baiduimage',
    'ie': 'utf-8',
    'word': word,
    'pn': '30',
}
head = {
    'Access-Control-Allow-Credentials': 'true',
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
}
with requests.get(url=url, params=query, headers=head) as resp:
    resp.encoding = "utf-8"
    html = resp.text
urls = re.findall('"thumbURL":"(.*?)"', html)
file = input('请建立一个存储图片的文件夹,输入文件夹名称即可:')
# exist_ok avoids a crash when the folder is already there (e.g. on a re-run).
os.makedirs(file, exist_ok=True)
for num, i in enumerate(urls):
    with requests.get(i, timeout=7) as pic:
        # os.path.join picks the right separator for the platform; the old
        # hard-coded r'\\' inserted two literal backslashes.
        string = os.path.join(file, word + '_' + str(num) + '.jpg')
        # The context manager flushes and closes every image file.
        with open(string, 'wb') as fp:
            fp.write(pic.content)