"""
思路 (Approach):
爬取所有的单词 (scrape all the words)
分页爬取 (page by page)
写入一个文件里面 (collect into one structure)
然后写入json文件中 (then dump as JSON)
最后写入文档 (finally write to a file)
"""
import os
import requests
from lxml import etree
def python(page):
    """Scrape English words and their Chinese translations from shanbay.com.

    For each page in 1..page, fetch the word-list page, pair every English
    word (``<strong>`` text) with its Chinese explanation (``td.span10``
    text), and write the result as a JSON object to
    ``./english/python/<n>.txt``.

    :param page: number of pages to scrape (int, or numeric string from input())
    """
    import json  # hoisted out of the loop; original re-imported every iteration

    # Ensure the output directory exists once, before the loop.
    path = './english/python/'
    if not os.path.exists(path):
        os.makedirs(path)

    # BUG FIX: the original loop variable was also named `page`, shadowing
    # the parameter; use a distinct name for the current page number.
    for page_no in range(1, int(page) + 1):
        base_url = "https://www.shanbay.com/wordlist/110521/232414/?page={}".format(page_no)
        response = requests.get(base_url)
        # Parse the HTML string into an lxml tree so we can run XPath queries.
        html_xml = etree.HTML(response.text)
        # English words are in <strong>; Chinese explanations in td.span10.
        python_list = html_xml.xpath('//strong/text()')
        china_list = html_xml.xpath('//td[@class="span10"]/text()')
        print(python_list)
        print(china_list)

        key = '{}页数'.format(page_no)
        big_dic = {key: {}}
        # zip() stops at the shorter list, avoiding the IndexError the
        # original risked when the two lists differ in length.
        for word, meaning in zip(python_list, china_list):
            big_dic[key][word] = meaning

        # BUG FIX: the original path was path + str(page) + '.txt' with no
        # separator (path lacked a trailing '/'), so files were written as
        # ./english/python<N>.txt instead of inside ./english/python/.
        # Also: json.dumps(str(big_dic)) serialized the dict's repr as a
        # single JSON string; dump the dict itself as a JSON object.
        file_path = path + str(page_no) + '.txt'
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(big_dic, ensure_ascii=False))
if __name__ == '__main__':
    # Entry point: ask how many pages to scrape, then run the scraper.
    page_count = input('请输入要爬取的页数')
    python(page_count)
# 第二种方式 (second approach)
'''
爬取扇贝网python必背词汇表 (scrape the Shanbay python must-know word list)
接口地址: https://www.shanbay.com/wordlist/110521/232414/
要求: 获取所有的python词汇数据, 形成字典, 然后存储数据
思路:
第一页: https://www.shanbay.com/wordlist/110521/232414/
第二页: https://www.shanbay.com/wordlist/110521/232414/?page=2
'''
import requests,re,os,json
from lxml import etree
class Shanbei:
    """Scraper for the Shanbay python must-know vocabulary list.

    Calling an instance with a page count downloads each listing page,
    extracts every word/translation pair, and stores one JSON file per
    page under ``./扇贝单词/第<n>页/``.
    """

    def __call__(self, *args, **kwargs):
        """Make the instance callable: ``shanbei(pages)`` runs the scrape."""
        self.get_xml(*args)

    def get_xml(self, pages):
        """Fetch pages 1..pages of the word list and hand each one off.

        :param pages: number of listing pages to scrape (int)
        """
        base_url = 'https://www.shanbay.com/wordlist/110521/232414/?page='
        for page in range(1, pages + 1):
            # BUG FIX: the original issued two requests per iteration and
            # the first one was hard-coded to ?page=1, so the saved html
            # (and the title extracted from it) always came from page 1.
            # Fetch the current page exactly once and reuse the text.
            base_html = requests.get(base_url + str(page)).text
            base_xml = etree.HTML(base_html)
            # Keep a local snapshot of the raw html for debugging.
            with open('贝壳应英语.html', 'w', encoding='utf-8') as f:
                f.write(base_html)
            self.get_python(base_html, base_xml, page)

    def get_python(self, base_html, base_xml, pages):
        """Parse one listing page and persist its word/translation pairs.

        :param base_html: raw html text of the page (used for the title)
        :param base_xml: lxml element tree parsed from the same page
        :param pages: 1-based page number, used in the output path
        """
        # Pull the list title out of the first <h4>, then take the first
        # alphabetic run (>=2 letters) as the word-list name, e.g. "python".
        worlds_rule = re.compile(r'<h4>(.*?)</h4>', re.S)
        main_title = worlds_rule.findall(base_html)
        # Equivalent to the original r'[a-z][A-Z]+' with re.I, written plainly.
        get_words = re.compile(r'[a-zA-Z]{2,}')
        world_title = get_words.findall(main_title[0])[0]
        print(world_title)

        # One <tr class="row"> per vocabulary entry; a relative XPath is far
        # less brittle than the original absolute /html/body/... chain.
        rows = base_xml.xpath(
            "//table[contains(@class,'table-bordered')]/tbody/tr[@class='row']")
        print(len(rows))

        key = '必备单词{}'.format(world_title)
        shell_word_dict = {key: {}}
        for row in rows:
            # Query each cell relative to its own row instead of re-running
            # a full-document XPath per index as the original did.
            word = row.xpath("./td[@class='span2']/strong/text()")
            word_description = row.xpath("./td[@class='span10']/text()")
            # Skip malformed rows rather than crashing on an empty result
            # (the original indexed [0] unconditionally).
            if word and word_description:
                shell_word_dict[key][word[0]] = word_description[0]
        print(shell_word_dict)

        # Store one JSON file per page: ./扇贝单词/第<n>页/第<n>页
        path = './扇贝单词/' + '第' + str(pages) + '页' + '/'
        print(path)
        if not os.path.exists(path):
            os.makedirs(path)
        file_path = path + '第' + str(pages) + '页'
        with open(file_path, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps the Chinese text human-readable on disk.
            f.write(json.dumps(shell_word_dict, ensure_ascii=False))
if __name__ == '__main__':
    # Page count is fixed for now; uncomment the input() line below to
    # make it interactive again.
    # pages = int(input('请输入需要爬取的页数'))
    total_pages = 3
    scraper = Shanbei()
    scraper(total_pages)