示例一:
# Example 1: fetch the Sogou homepage and save the raw HTML locally.
import requests

if __name__ == "__main__":
    url = "https://www.sogou.com"
    response = requests.get(url)
    html = response.text
    print(html)
    # Persist the page so it can be inspected offline.
    with open("./sogou.html", "w", encoding="utf-8") as fp:
        fp.write(html)
    print("爬取数据结束!")
示例二:
# Example 2: run a Sogou web search for a user-supplied keyword and save
# the result page as "<keyword>.html".
import requests

if __name__ == "__main__":
    url = "https://www.sogou.com/web"
    # Spoof the User-Agent so the request looks like a normal Chrome browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'
    }
    kw = input("input a word:")
    param = {'query': kw}
    response = requests.get(url=url, params=param, headers=headers)
    fileName = kw + ".html"
    with open(fileName, "w", encoding="utf-8") as fp:
        fp.write(response.text)
    print("爬取数据结束!")
示例三:
需求:破解百度翻译
要点:1、POST请求;2、响应数据是一组JSON数据;
# Example 3: query the Baidu Translate endpoint.
# Key points: 1) POST request; 2) the response body is JSON.
import requests
import json

if __name__ == "__main__":
    url = "https://fanyi.baidu.com/v2transapi?from=zh&to=en"
    # Spoof the User-Agent so the request looks like a normal browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'
    }
    kw = input("input a word:")
    # NOTE(review): 'sign' and 'token' are session-specific values captured
    # from a browser; they expire, so the endpoint may reject this request
    # until fresh values are substituted — confirm before relying on output.
    param = {
        'query': kw,
        'from': 'zh',
        'to': 'en',
        'transtype': 'translang',
        'simple_means_flag': 3,
        'sign': 525993.845208,
        'token': '70d8c1b30159b2ea2811981cf885f1b1',
        'domain': 'common'
    }
    res = requests.post(url=url, params=param, headers=headers)
    dic_obj = res.json()
    fileName = kw + ".json"
    # Bug fix: the original opened the file without ever closing it; a
    # context manager guarantees the handle is released.
    with open(fileName, 'w', encoding="utf-8") as fp:
        json.dump(dic_obj, fp=fp, ensure_ascii=False)
    print("爬取数据结束!")
示例四:
爬取豆瓣电影分类排行榜
# Example 4: fetch a Douban movie collection via its JSON API and dump
# the raw response to a local file.
import requests
import json

if __name__ == '__main__':
    url = 'https://m.douban.com/rexxar/api/v2/niffler/collection/28/columns'
    # Spoof the User-Agent so the request looks like a normal browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0'
    }
    params = {
        'start': '0',
        'count': '2000',
        'for_mobile': '1'
    }
    res = requests.get(url=url, params=params, headers=headers)
    list_data = res.json()
    # Bug fix: the original never closed the file; `with` releases the
    # handle even if json.dump raises.
    with open('db21.json', 'w', encoding='utf-8') as fp:
        json.dump(list_data, fp=fp, ensure_ascii=False)
    print('数据抓取完成!')
示例五:
爬取国家食品药品监督管理总局化妆品相关数据(列表中的详细信息)
# Example 5: scrape cosmetics licensing data from the NMPA portal —
# first collect company IDs from the paginated list endpoint, then fetch
# each company's detail record by ID.
import requests
import json

if __name__ == '__main__':
    # Spoof the User-Agent so requests look like a normal browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0'
    }
    # Batch-collect the ID of every company across all 48 list pages.
    url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
    id_arr = []
    for page in range(1, 49):
        params = {
            'on': "true",
            # Converted inline instead of rebinding the loop variable.
            'page': str(page),
            'pageSize': "15",
            'productName': "",
            'conditionType': "1",
            'applyname': "",
            'applysn': ""
        }
        list_data = requests.post(url=url, data=params, headers=headers).json()
        for dic in list_data['list']:
            id_arr.append(dic['ID'])
    # Fetch the detail record for each collected company ID.
    enterprise_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
    enterprise_arr = []
    # Renamed from `id` to avoid shadowing the `id` builtin.
    for company_id in id_arr:
        enterprise_params = {
            'id': company_id
        }
        enterprise_list_data = requests.post(url=enterprise_url, data=enterprise_params, headers=headers).json()
        enterprise_arr.append(enterprise_list_data)
    # Bug fix: the original left the output file handle open; use `with`.
    with open('db25.json', 'w', encoding='utf-8') as fp:
        json.dump(enterprise_arr, fp=fp, ensure_ascii=False)
    print('抓取完成')
示例五:聚焦爬虫
环境安装:
pip install bs4
pip install lxml
# Focused-crawler warm-up: fetch the Sogou homepage, parse it with
# BeautifulSoup, print the <title> tag, and save the raw HTML.
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    url = 'https://www.sogou.com/'
    # Spoof the User-Agent to look like a regular Chrome browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'
    }
    homepage_html = requests.get(url=url, headers=headers).text
    parsed = BeautifulSoup(homepage_html, 'lxml')
    print(parsed.title)
    with open('./sogou.html', 'w', encoding='utf-8') as html_file:
        html_file.write(homepage_html)
    print('sogou抓取完成!')
示例六:聚焦爬虫之使用BeautifulSoup抓取列表中的详细内容:
# Example 6: focused crawling — use BeautifulSoup to collect every
# chapter link from the table of contents, then fetch and append each
# chapter's text to one output file.
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0'
    }
    list_data = requests.get(url=url, headers=headers)
    # Fix mojibake: force UTF-8 decoding of the response body.
    list_data.encoding = 'utf-8'
    list_soup = BeautifulSoup(list_data.text, 'lxml')
    li_list = list_soup.select('.book-mulu > ul > li')
    # Bug fix: the original opened the file and never closed it; wrapping
    # the loop in a context manager guarantees the handle is released
    # even if one of the per-chapter requests raises.
    with open('./sanguo.html', 'w', encoding='utf-8') as fp:
        for li in li_list:
            title = li.text
            print(title)
            src = 'https://www.shicimingju.com' + li.a['href']
            detail_data = requests.get(url=src, headers=headers)
            # Same mojibake fix for each chapter page.
            detail_data.encoding = 'utf-8'
            detail_soup = BeautifulSoup(detail_data.text, 'lxml')
            detail_text = detail_soup.find('div', class_='chapter_content').text
            fp.write(title + '\n' + detail_text + '\n')
示例七:使用xpath爬取58同城中的列表数据
# Example 7: use XPath to scrape the apartment listing titles from 58.com.
import requests
from lxml import etree

if __name__ == '__main__':
    url = 'https://cs.58.com/pinpaigongyu/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0'
    }
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@class="list"]/li')
    # Bug fix: the original never closed the output file; `with` closes
    # it deterministically even if an xpath lookup raises.
    with open('./58.txt', 'w', encoding='utf-8') as fp:
        for li in li_list:
            title = li.xpath('./a//h2/text()')[0]
            print(title)
            fp.write(title + '\n')
示例八:xpath抓取多页图片数据
# Example 8: crawl pages 9-202 of the 4K scenery wallpaper index with
# XPath and save every thumbnail image under ./fengjing.
import os
import requests
from lxml import etree

if __name__ == '__main__':
    # Create the output directory on first run.
    if not os.path.exists('./fengjing'):
        os.mkdir('./fengjing')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0'
    }
    for page_no in range(9, 203):
        page_url = 'https://pic.netbian.com/4kfengjing/index_' + str(page_no) + '.html'
        page_html = requests.get(url=page_url, headers=headers).text
        page_tree = etree.HTML(page_html)
        for item in page_tree.xpath('//div[@class="slist"]/ul/li'):
            raw_title = item.xpath('./a/img/@alt')[0] + '.jpg'
            # The site serves GBK text; re-decode the alt attribute to
            # repair the mojibake in the file name.
            img_title = raw_title.encode('iso-8859-1').decode('gbk')
            img_url = 'https://pic.netbian.com' + item.xpath('./a/img/@src')[0]
            img_bytes = requests.get(url=img_url, headers=headers).content
            with open('./fengjing/' + img_title, 'wb') as fp:
                fp.write(img_bytes)
            print(img_title + '抓取完成!')
示例九:xpath抓取页面中的热门城市和所有城市
# Example 9: use a single XPath union query to grab both the "hot"
# cities and the complete city list from the air-quality history page.
import requests
from lxml import etree

if __name__ == '__main__':
    url = 'https://www.aqistudy.cn/historydata/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0'
    }
    page_html = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_html)
    # The union (|) covers both sections in one query:
    #   hot cities: //div[@class="hot"]//ul[@class="unstyled"]/li
    #   all cities: //div[@class="all"]//ul[@class="unstyled"]//li
    city_nodes = tree.xpath('//div[@class="hot"]//ul[@class="unstyled"]/li | //div[@class="all"]//ul[@class="unstyled"]//li')
    all_city_names = [node.xpath('./a/text()')[0] for node in city_nodes]
    print(all_city_names)
    print(len(all_city_names))
示例十:使用xpath抓取多页中对应的大图片
# Example 10: for each listing page on sc.chinaz.com, follow every
# thumbnail link to its detail page and download the full-size image.
import requests
import os
from lxml import etree

if __name__ == '__main__':
    if not os.path.exists('./chinaz-pic'):
        os.mkdir('./chinaz-pic')
    # Hoisted out of the page loop: the headers never change, so there is
    # no reason to rebuild the dict on every iteration (the original did).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0'
    }
    for i in range(26, 37):
        url = 'https://sc.chinaz.com/tupian/fengjingtupian_' + str(i) + '.html'
        list_data = requests.get(url=url, headers=headers).text
        # The body arrives decoded as latin-1; re-decode it as UTF-8 to
        # repair the mojibake in titles.
        list_data = list_data.encode('iso-8859-1').decode('utf-8')
        list_etree = etree.HTML(list_data)
        div_list_tree = list_etree.xpath('//div[@id="container"]/div')
        for div in div_list_tree:
            img_url = 'https:' + div.xpath('./div/a/@href')[0]
            img_title = div.xpath('./div/a/img/@alt')[0] + '.jpg'
            img_path = './chinaz-pic/' + img_title
            # The thumbnail links to a detail page; the real download URL
            # sits behind the "image_gall" anchor on that page.
            img_data = requests.get(url=img_url, headers=headers).text
            img_etree = etree.HTML(img_data)
            img_tree = img_etree.xpath('//div[@class="down_img"]//a[@class="image_gall"]/@href')[0]
            img_download_url = 'https:' + img_tree
            img_download_data = requests.get(url=img_download_url, headers=headers).content
            with open(img_path, 'wb') as fp:
                fp.write(img_download_data)
            print(img_title + '抓取完成...')