# Demo spider: fetch http://langlang2017.com/route.html and explore the page
# with BeautifulSoup (tag access, tree traversal, find_all, CSS selectors).
from urllib import request

import re  # used by the regex-based find_all examples below

from bs4 import BeautifulSoup

# (1) Fetch the page content.
base_url = "http://langlang2017.com/route.html"
response = request.urlopen(base_url)
html = response.read()  # raw bytes (unformatted content)
response.close()  # release the connection once the body has been read
# html = html.decode("utf-8")  # not needed: BeautifulSoup decodes automatically
# print(html)

# (2) Parse the data (extract values from the page content).
# 2.1 Build the BeautifulSoup document tree from the fetched bytes.
soup = BeautifulSoup(html, 'lxml')
# 2.2 Pretty-printed string of the whole document.
content = soup.prettify()
# print(content)
# print(soup)  # crude way to dump the document

# (3) Extracting specific data.
# print(soup.title)
# print(soup.meta)
# print(soup.link)
# print(soup.img)
# 3.1 tag -- (a) name
# print(soup.name)        # '[document]'
# print(soup.title)
# print(soup.title.name)
# print(soup.head.name)
# 3.1 tag -- (b) attrs
# print(soup.title.attrs)
# print(soup.meta.attrs)
img = soup.img.attrs  # attribute dict of the FIRST <img> on the page
# print(img['src'])
# 3.1 tag -- (c) rewrite an attribute value: make the relative src absolute.
# print(img)
domain = 'http://www.langlang2017.com/'
img['src'] = domain + img['src']
# print(img)
# 3.1 tag -- (d) delete an attribute.
img = soup.img.attrs
# pop() instead of `del img["alt"]`: no KeyError when the tag has no alt.
img.pop("alt", None)
# print(img)
# 3.2 NavigableString -- format: tag.string
# print(soup.title.string)
# 3.3 BeautifulSoup object itself
# print(soup.name)
# print(type(soup.name))  # <class 'str'>
# 3.4 Comments -- not covered here.

# (4) Traversing the document -- solves the problem that `soup.tag` only
#     matches the first tag of that name and misses the rest.
# 4.1 tag.contents -- list of child nodes
# print(soup.head)
# print(type(soup.head))
# head = soup.head.contents
# print(head)
# print(head[3])
# print(head[5])
# 4.2 tag.children -- iterator over child nodes
# for item in soup.head.children:
#     print(item)
# 4.3 tag.descendants -- all descendant nodes
# print(soup.div)
# for item in soup.div.descendants:
#     if item != '\n':
#         print(item)
# print('~~~~~~~~~~~~~~~~~~~~ separator ~~~~~~~~~~~~~~~~~~~~~~~~')

# (5) Searching the document with find_all().
# 5.1 name argument -- (a) plain string
# for meta in soup.find_all('meta'):
#     print(meta)
# 5.1 name argument -- (b) compiled regex
# pattern = re.compile('^m')
# pattern = re.compile(r'<meta')
# for item in soup.find_all(pattern):
#     print(item)
# 5.1 name argument -- (c) list of tag names
# print(soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5']))
# 5.2 keyword arguments
# print(soup.find_all(id='weixin'))
# print(soup.find_all(id='taobao'))
# 5.3 searching by text
# print(soup.find_all(text='预订'))
# print(soup.find_all(text=["预定", "大连哈仙岛旅游--来岛路线"]))
# print(soup.find_all(text=re.compile("路线")))

# (6) CSS selectors -- soup.select().
# 6.1 by tag name
# print(soup.select('meta'))
# 6.2 by class name -- format: .classname
# print(soup.select('.center'))
# print(soup.select('.gaotie'))
# print(soup.select('.routeway'))
# 6.3 by id -- format: #id
# print(soup.select('#weixin'))
# 6.4 combined selectors
# print(soup.select('div #taobao'))
# print(soup.select('head > title'))
# 6.5 by attribute
# print(soup.select('div img[class="jianjieditu"]'))
# 6.6 getting the text content
# print(soup.select('h2')[0].get_text())
# for h2 in soup.select('h2'):
#     print(h2.get_text())

# test 1: grab the slogan "先消费后付款,主动权始终在您手中" and its image.
# print(soup.select('div p[class="footertext"]')[0].get_text())
# print(soup.select('div img')[0].get('src'))
# for child in soup.select('div[class="xianxiao"]')[0].children:
#     if child != '\n':
#         child_attrs = child.attrs  # e.g. {'src': 'img/xianxiao.png'}
#         if 'src' in child_attrs:
#             print(child_attrs['src'])
#         print(child.get_text())

# test 2: for every card under the quality-guarantee section, print the
# image src (for <img> children) and the stripped caption text.
for card in soup.select('div[class="pinzhibaozhang_center"] > div'):
    for child in card.children:
        if child != '\n':  # skip the bare newline text nodes between tags
            # (a) attributes
            child_attrs = child.attrs
            if 'src' in child_attrs:
                print(child_attrs['src'])
            # (b) text content
            print(child.get_text().strip())
# Sample run output (pasted from the IDE console; kept as a comment so the
# file remains valid Python):
# /Library/Frameworks/Python.framework/Versions/3.6/bin/python3.6 /Users/apple/PycharmProjects/stage4/spider/2018_3_8/04bs4.py
# img/xianxiao.png
# 先消费后付款,主动权始终在您手中
# img/tao.png
# 淘宝交易,安全放心
# img/yijian.png
# 一键消费,中间不收取其他费用
# img/che.png
# 免费车接车送;代订离岛船票
# Process finished with exit code 0