爬取TensorFlow_API目录链接
- 爬取TensorFlow库的所有链接
- 把所有链接保存在data_list
- 代码还可以优化,但是有点懒了,能下载就好了
import json

import requests
from pyquery import PyQuery as pq
# Request headers: desktop Chrome UA plus the ':authority' value the site expects.
headers={
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
    'authority': 'tensorflow.google.cn'
}
# Accumulates the parsed API-menu tree; filled in by menu_parse() below.
data_list=[]
# Path the menu tree is dumped to as JSON.
save_path='E:\\python\\tf.json'
def menu_parse():
    """Scrape the TensorFlow API navigation menu into the global data_list.

    Walks up to four levels of the expandable sidebar menu. Expandable
    entries become {'class_nameN': ..., 'child_listN': [...]} dicts, leaf
    entries become {'fun_name': ..., 'url': ...} dicts. The 'fun_name'-first
    key order is relied on later by the download loop, which dispatches on
    list(d.keys())[0].
    """
    url="https://tensorflow.google.cn/api_docs/python/tf"
    r=requests.get(url,headers=headers)
    pq_doc=pq(r.text)
    # Top-level expandable entries of the "_book" sidebar menu.
    generator_1=pq_doc('div.devsite-mobile-nav-bottom ul[menu="_book"] >li.devsite-nav-item.devsite-nav-expandable').items()
    for generator_2 in generator_1: # level-1 menu entries
        class_name1=generator_2.children("devsite-expandable-nav >div >span").text()
        generator_2=generator_2.children('devsite-expandable-nav >ul >li').items()
        child_list1=[]
        for generator_3 in generator_2: # level-2 menu entries
            # Expandable node -> recurse one level deeper; otherwise it is a leaf link.
            if(generator_3('.devsite-nav-item.devsite-nav-expandable')):
                class_name2=generator_3.children('devsite-expandable-nav >div >span').text()
                generator_3=generator_3.children('devsite-expandable-nav >ul >li').items()
                child_list2=[]
                for generator_4 in generator_3: # level-3 menu entries
                    if(generator_4('.devsite-nav-item.devsite-nav-expandable')):
                        class_name3=generator_4.children('devsite-expandable-nav >div >span').text()
                        generator_4=generator_4.children('devsite-expandable-nav >ul >li').items()
                        child_list3=[]
                        for generator_5 in generator_4: # level-4 menu entries
                            if(generator_5('.devsite-nav-item.devsite-nav-expandable')):
                                class_name4=generator_5.children('devsite-expandable-nav >div >span').text()
                                generator_5=generator_5.children('devsite-expandable-nav >ul >li').items()
                                child_list4=[]
                                for generator_6 in generator_5:
                                    # Deepest level: always treated as leaf links.
                                    fun_name4=generator_6('span.devsite-nav-text').text()
                                    url4=generator_6('a.devsite-nav-title').attr('href')
                                    child_list4.append({'fun_name':fun_name4,'url':url4})
                                child_list3.append({'class_name4':class_name4,'child_list4':child_list4})
                            else:
                                fun_name3=generator_5('span.devsite-nav-text').text()
                                url3=generator_5('a.devsite-nav-title').attr('href')
                                child_list3.append({'fun_name':fun_name3,'url':url3})
                        child_list2.append({'class_name3':class_name3,'child_list3':child_list3})
                    else:
                        fun_name2=generator_4('span.devsite-nav-text').text()
                        url2=generator_4('a.devsite-nav-title').attr('href')
                        child_list2.append({'fun_name':fun_name2,'url':url2})
                child_list1.append({'class_name2':class_name2, 'child_list2':child_list2})
            else:
                # NOTE: reuses the local name `url` from above; harmless, as it
                # is not read again after this point.
                fun_name=generator_3('span.devsite-nav-text').text()
                url=generator_3('a.devsite-nav-title').attr('href')
                child_list1.append({'fun_name':fun_name,'url':url})
        data_list.append({'class_name1':class_name1, 'child_list1':child_list1})
        # break
menu_parse()
把data_list保存到本地文件里
- 主要为了检查是否爬取完整
# Dump the crawled menu tree to disk so completeness can be checked by eye.
# print(data_list)
# encoding is pinned so the dump is byte-identical regardless of the
# Windows locale codepage the script happens to run under.
with open(save_path, 'w', encoding='utf-8') as file:
    file.write(json.dumps(data_list, indent=2))
导入下载的链接
上面我把链接保存到了E:\python\tf.json,这里的主要目的是为了把它读回来作为data使用
import json

# Path the menu tree was dumped to by the crawl step above.
save_path = 'E:\\python\\tf.json'
# Parse straight from the file handle; an intermediate read()+loads pass
# is unnecessary. Encoding pinned to match the writer.
with open(save_path, 'r', encoding='utf-8') as file:
    data = json.load(file)
# print(data)
爬取内容
爬取html
import html2text as ht
from pyquery import PyQuery as pq
import requests
import os

# Base site for resolving the relative hrefs collected in data.
base_url = 'https://tensorflow.google.cn'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
    'authority': 'tensorflow.google.cn'
}
# Root folder the Markdown files are written into.
base_directory = "E:\\python\\tensorflow\\tensorflow"
# makedirs(..., exist_ok=True) also creates missing parent directories and
# avoids the exists()/mkdir race; os.mkdir would fail if E:\python\tensorflow
# did not already exist.
os.makedirs(base_directory, exist_ok=True)
def html_handle(html):
    """Return the documentation article body extracted from a full page.

    Only the 'div.devsite-article-body.clearfix' element is kept; headers,
    navigation and footers are discarded.
    """
    # NOTE(review): stripping 'table.vertical-rules' ("Used in the guide /
    # tutorials" boxes) was considered but left disabled, matching the
    # original behaviour.
    article = pq(html)('div.devsite-article-body.clearfix')
    return article
def html2text(html_handle, class_path, fun_name):
    """Convert article HTML to Markdown and write <class_path>/<fun_name>.md.

    NOTE(review): the first parameter name shadows the module-level
    html_handle() function; the visible caller passes positionally, so the
    name is kept for compatibility.
    """
    converter = ht.HTML2Text()
    converter.images_to_alt = True    # keep alt text instead of image links
    converter.ignore_tables = True    # tables convert poorly to Markdown
    markdown = converter.handle(html_handle)
    target = os.path.join(class_path, fun_name + '.md')
    with open(target, 'w', encoding='UTF-8') as f:
        f.write(markdown)
def request(url, class_path, fun_name):
    """Fetch one documentation page and save its article body as Markdown.

    Network problems are reported and swallowed so a single bad page does
    not abort the whole crawl. Non-200 responses are reported and skipped.
    """
    # print(url)
    try:
        response = requests.get(url, headers=headers, timeout=4)
        if response.status_code == 200:
            content = html_handle(response.text)
            content = str(content)
            html2text(content, class_path, fun_name)
        else:
            # previously non-200 pages were dropped silently
            print('Error: HTTP', response.status_code, url)
    except requests.exceptions.ConnectionError as e:
        print('Error', e.args)
    except requests.exceptions.MissingSchema:
        print('Error1')
    except requests.exceptions.RequestException as e:
        # timeout=4 is set above, but ReadTimeout is not a ConnectionError
        # subclass and previously crashed the crawl; catch the remaining
        # requests errors here.
        print('Error', type(e).__name__, url)
# Walk the loaded menu tree and mirror it on disk: each 'class_nameN' node
# becomes a directory, each 'fun_name' leaf becomes a downloaded .md file.
# Dispatch on the FIRST dict key: leaves were appended as
# {'fun_name': ..., 'url': ...} and branches as {'class_nameN': ..., ...},
# so list(d.keys())[0] distinguishes them (relies on dict insertion order,
# guaranteed since Python 3.7).
for data1 in data:
    # Spaces are stripped so names are safe as Windows directory names.
    class_name = data1['class_name1'].replace(" ","")
    print("正在下载:", class_name)
    class_path_1 = os.path.join(base_directory, class_name)
    if not os.path.exists(class_path_1):
        os.mkdir(class_path_1)
    for data2 in data1['child_list1']:
        if(list(data2.keys())[0] == "fun_name"):
            # Leaf: fetch the page; hrefs in the tree are site-relative.
            fun_name = data2['fun_name']
            url = base_url+data2['url']
            request(url, class_path_1, fun_name)
        else:
            # Branch: create the subdirectory and descend one level.
            class_name = data2['class_name2'].replace(" ","")
            class_path_2 = os.path.join(class_path_1, class_name)
            if not os.path.exists(class_path_2):
                os.mkdir(class_path_2)
            for data3 in data2['child_list2']:
                if(list(data3.keys())[0] == "fun_name"):
                    fun_name = data3['fun_name']
                    url = base_url+data3['url']
                    request(url, class_path_2, fun_name)
                else:
                    class_name = data3['class_name3'].replace(" ","")
                    class_path_3 = os.path.join(class_path_2, class_name)
                    if not os.path.exists(class_path_3):
                        os.mkdir(class_path_3)
                    for data4 in data3['child_list3']:
                        if(list(data4.keys())[0] == "fun_name"):
                            fun_name = data4['fun_name']
                            url = base_url+data4['url']
                            request(url, class_path_3, fun_name)
                        else:
                            class_name = data4['class_name4'].replace(" ","")
                            class_path_4 = os.path.join(class_path_3, class_name)
                            if not os.path.exists(class_path_4):
                                os.mkdir(class_path_4)
                            # Level 4 contains only leaves by construction.
                            for data5 in data4['child_list4']:
                                fun_name = data5['fun_name']
                                url = base_url+data5['url']
                                request(url, class_path_4, fun_name)
print("下载完成")
正在下载: tf
正在下载: tf.audio
正在下载: tf.autograph
正在下载: tf.bitwise
正在下载: tf.compat
正在下载: tf.config
正在下载: tf.data
正在下载: tf.debugging
正在下载: tf.distribute
正在下载: tf.dtypes
正在下载: tf.errors
正在下载: tf.estimator
正在下载: tf.experimental
正在下载: tf.feature_column
正在下载: tf.graph_util
正在下载: tf.image
正在下载: tf.initializers
正在下载: tf.io
正在下载: tf.keras
正在下载: tf.linalg
正在下载: tf.lite
正在下载: tf.lookup
正在下载: tf.losses
正在下载: tf.math
正在下载: tf.metrics
正在下载: tf.nest
正在下载: tf.nn
正在下载: tf.optimizers
正在下载: tf.quantization
正在下载: tf.queue
正在下载: tf.ragged
正在下载: tf.random
正在下载: tf.raw_ops
正在下载: tf.saved_model
正在下载: tf.sets
正在下载: tf.signal
正在下载: tf.sparse
正在下载: tf.strings
正在下载: tf.summary
正在下载: tf.sysconfig
正在下载: tf.test
正在下载: tf.tpu
正在下载: tf.train
正在下载: tf.version
正在下载: tf.xla
下载完成