Baidu Translate
import requests
import json

if __name__ == '__main__':
    url = 'https://fanyi.baidu.com/sug'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    word = input('enter a word:')
    data = {
        'kw': word
    }
    # POST the form data; the sug endpoint answers with JSON suggestions
    response = requests.post(url=url, headers=headers, data=data)
    response_obj = response.json()
    path = './' + word + '.txt'
    # The translation suggestions live under data[0]['v']
    content = response_obj['data'][0]['v']
    with open(path, 'w', encoding='utf-8') as fp:
        json.dump(content, fp=fp, ensure_ascii=False)
    print('over')
"""
思路整体:
1.打开百度翻译
2.F12,右键-检查-Network,打开抓包工具,勾选XHR(XMLHttpRequest)->AJAX
3.输入要翻译的单词,比如:hero
4.找到sug,查看Headers - Request URL - Request Method - (POST) - Form Data
response = request.post(url=url, headers=headers, data=data)
5.查看Headers - Response-Headers 中的 content-type,如果是application/json
response.json()中的json()方法将会返回一个obj
6.查看Response,查看json数据格式
7.进行持久化存储
"""
Webpage generator
import requests

if __name__ == '__main__':
    url = 'https://www.sogou.com/web?'
    word = input('Enter the keyword to search for: ')
    params = {
        'query': word
    }
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    # requests appends the params dict to the URL as the query string
    response = requests.get(url=url, headers=headers, params=params)
    file = word + '.html'
    # Save the returned HTML page to disk
    with open(file, 'w', encoding='utf-8') as f:
        f.write(response.text)
"""
整体思路:
1.在搜狗浏览器中搜索关键字
2.获取url,https://www.sogou.com/web?query=要搜索的关键字
3.将?后的参数提取出来,使用params构造一个字典
params = {
'query': word
}
4.持久化存储
"""
NMPA (National Medical Products Administration)
import requests

if __name__ == '__main__':
    url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    data = {
        'on': 'true',
        'page': 1,
        'pageSize': 15,
        'productName': '',
        'conditionType': 1,
        'applyname': '',
        'applysn': ''
    }
    # First request: the list endpoint, which only contains the IDs of the detail pages
    response_list = []
    response = requests.post(url=url, headers=headers, data=data).json()
    for i in response['list']:
        response_list.append(i['ID'])

    # Second batch of requests: one detail lookup per ID
    detail_list = []
    for i in response_list:
        data = {
            'id': i
        }
        detail_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
        detail_response = requests.post(url=detail_url, data=data, headers=headers).json()
        detail_dict = {
            'legal representative': detail_response['businessPerson'],
            'company address': detail_response['epsAddress']
        }
        detail_list.append(detail_dict)

    for i in detail_list:
        print(i)
"""
整体思路:
1.先爬取药监局的官网
2.得到详情页的ID,用列表保存起来
3.遍历列表,将所有的ID值作为data的参数遍历
"""
KFC store locations
import requests

if __name__ == '__main__':
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    area = input('Enter the location to look up (first 20 results): ')
    data = {
        'cname': '',
        'pid': '',
        'keyword': area,
        'pageIndex': 1,
        'pageSize': 20
    }
    # Parse the JSON body directly instead of eval()-ing untrusted response text
    response_dict = requests.post(url=url, headers=headers, data=data).json()
    for i in response_dict['Table1']:
        print(i['rownum'], i['storeName'])
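The endpoint pages its results through pageIndex/pageSize, so fetching more than the first 20 stores is just a loop over pageIndex. This is a sketch: the stopping condition (an empty or missing Table1) is an assumption, since the original only ever requests page 1.

import requests

URL = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
HEADERS = {'user-agent': 'Mozilla/5.0'}

def iter_stores(keyword, page_size=20):
    """Sketch: yield stores page by page until a page comes back empty."""
    page = 1
    while True:
        data = {'cname': '', 'pid': '', 'keyword': keyword,
                'pageIndex': page, 'pageSize': page_size}
        payload = requests.post(URL, headers=HEADERS, data=data).json()
        rows = payload.get('Table1') or []   # assumption: missing/empty means no more pages
        if not rows:
            break
        for row in rows:
            yield row['rownum'], row['storeName']
        page += 1

if __name__ == '__main__':
    for rownum, name in iter_stores('北京'):
        print(rownum, name)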
Douban movies
import requests

if __name__ == '__main__':
    url = 'https://movie.douban.com/j/search_subjects'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    params = {
        'type': 'tv',
        'tag': '热门',        # "popular"; the API expects the Chinese tag value
        'sort': 'recommend',
        'page_limit': 20,
        'page_start': 0
    }
    response = requests.get(url=url, headers=headers, params=params).json()
    for num, item in enumerate(response['subjects'], start=1):
        print(num)
        print('rating:', item['rate'], 'title:', item['title'])
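page_limit/page_start act as an offset-based window into the ranking, so later pages are fetched by advancing page_start in steps of page_limit. The sketch below reuses the search_subjects endpoint and parameters above; whether the server caps the total offset is not something the original confirms.

import requests

URL = 'https://movie.douban.com/j/search_subjects'
HEADERS = {'user-agent': 'Mozilla/5.0'}

def top_titles(pages=3, page_limit=20):
    """Sketch: fetch the first `pages` windows of 'popular' TV subjects."""
    titles = []
    for page in range(pages):
        params = {
            'type': 'tv',
            'tag': '热门',                        # "popular"
            'sort': 'recommend',
            'page_limit': page_limit,
            'page_start': page * page_limit,      # offset advances by one window per request
        }
        payload = requests.get(URL, headers=HEADERS, params=params).json()
        titles.extend(item['title'] for item in payload['subjects'])
    return titles

if __name__ == '__main__':
    for rank, title in enumerate(top_titles(), start=1):
        print(rank, title)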
Scraping the front-page images of Qiushibaike's image ranking
import requests
import os
import re

if __name__ == '__main__':
    if not os.path.exists('./Img'):
        os.mkdir('./Img')
    url = 'https://www.qiushibaike.com/imgrank/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers).text
    # Pull the protocol-relative image URLs out of the thumb divs
    ex = '<div class="thumb">.*?<img src="(.*?)" alt'
    img_src = re.findall(ex, response, re.S)
    for i in img_src:
        detail_url = 'https:' + i            # the src attribute starts with //, so prepend the scheme
        img_name = detail_url.split('/')[-1]
        # .content gives the raw bytes of the image
        img_data = requests.get(url=detail_url, headers=headers).content
        img_path = './Img/' + img_name
        with open(img_path, 'wb') as f:
            f.write(img_data)
        print(img_name, 'downloaded')
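The image ranking is paginated; in tutorials of this kind the later pages are usually reachable at /imgrank/page/<n>/, but that URL pattern is an assumption here rather than something the code above demonstrates. A sketch of crawling several pages with the same regex:

import os
import re
import requests

HEADERS = {'user-agent': 'Mozilla/5.0'}
PATTERN = '<div class="thumb">.*?<img src="(.*?)" alt'

def download_pages(pages=3, out_dir='./Img'):
    """Sketch: apply the same regex to the first `pages` ranking pages."""
    os.makedirs(out_dir, exist_ok=True)
    for n in range(1, pages + 1):
        # Assumed pagination pattern; page 1 corresponds to the plain /imgrank/ page
        page_url = 'https://www.qiushibaike.com/imgrank/page/%d/' % n
        html = requests.get(page_url, headers=HEADERS).text
        for src in re.findall(PATTERN, html, re.S):
            img_url = 'https:' + src
            name = img_url.split('/')[-1]
            data = requests.get(img_url, headers=HEADERS).content
            with open(os.path.join(out_dir, name), 'wb') as f:
                f.write(data)
            print(name, 'downloaded')

if __name__ == '__main__':
    download_pages()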