#coding:utf-8
import requests
from bs4 import BeautifulSoup
import lxml
import codecs
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
# Collect the URLs of all tutorial pages
def get_url(url):
    """Collect the absolute URL of every tutorial page linked from the
    left-hand sidebar of the given index page.

    url     -- URL of the tutorial index page
    returns -- list of absolute page URLs, in sidebar order
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
    request = requests.get(url, headers=headers)
    html = request.text
    all_url = []
    soup = BeautifulSoup(html, 'lxml')
    all_a = soup.find('div', class_='x-sidebar-left-content').find_all('a')
    for a in all_a:
        href = a.get('href')
        # BUG FIX: an <a> tag without an href attribute makes .get('href')
        # return None, which crashed the '+' concatenation below. Skip those.
        if href:
            all_url.append('http://www.liaoxuefeng.com' + href)
    return all_url
# Download each page and save it to disk
def download_text(all_url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
for url in all_url: #浏览每一个页面,然后下载
re=requests.get(url,headers=headers)
html=re.text
soup=BeautifulSoup(html,'lxml')
all_h4=soup.find_all('h4')
div=soup.find('div',class_='x-wiki-content')
content=all_h4[0].text+div.text
if all_h4[0].text=='map/reduce': #这是判断里面的名字可能和路径相冲突
filename='g:\\file\\'+'map and reduce'+'.py'
elif all_h4[0].text==u'TCP/IP简介':
filename='g:\\file\\'+'TCP and IP'+'.py'
elif all_h4[0].text=='async/await':
filename='g:\\file\\'+'async and await'+'.py'
else:
filename='g:\\file\\'+all_h4[0].text+'.py'
print all_h4[0].text
print u'正在下载'+all_h4[0].text
with codecs.open(filename,'w',encoding='utf-8') as f: #将信息写入py文件中
f.write(content)
if __name__ == "__main__":
    # Index page of the Python 3 tutorial; every chapter is reachable
    # from its sidebar.
    index_url = 'http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000'
    download_text(get_url(index_url))
# Scraping Liao Xuefeng's Python 3 tutorial with Python
# (blog note: latest recommended article published 2020-10-28 21:08:18)