1. 正则表达式(regular expression):re 模块
re.sub("a", "b", s):把字符串 s 里所有匹配 a 的部分替换成 b,并返回新的字符串(原字符串不变)
2贪婪与非贪婪
.*:贪婪
.*?:非贪婪
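3. 获得网页源码
urllib.request.urlopen(url).read():读取网页源码(返回 bytes,需要 decode 成字符串)
urllib.request.urlretrieve(url, filename):把网页或文件直接下载并保存到本地文件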
4. BeautifulSoup 解析 HTML
import re

from bs4 import BeautifulSoup

# Sample document that every lookup below runs against.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p >
<p class="story">Once upon a time there were three little sisters; and their names were
<a href=" " class="sister" id="link1"><!-- Elsie --></a >,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a > and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a >;
and they lived at the bottom of a well.</p >
<p class="story">...</p >
"""

# Parse the markup into a searchable tree (requires the "lxml" parser).
soup = BeautifulSoup(html, "lxml")

# find(): returns only the FIRST element that satisfies the condition.
p = soup.find("p")                       # match by tag name
p = soup.find(attrs={"class": "title"})  # match by attribute value
p = soup.find(string="...")              # match by exact text (`string=` replaces the deprecated `text=`)
p = soup.find(re.compile("^b"))          # match tag names against a regex (names starting with "b")

# find_all(): searches the whole document and returns every match as a list.
p = soup.find_all("p")
# len(p) gives the number of matches.

# select(): searches the whole document with a CSS selector and returns a list.
a = soup.select("#link2")    # ID selector
a = soup.select("a")         # tag selector
a = soup.select(".sister")   # class selector
a = soup.select("p #link2")  # hierarchy (descendant) selector
a = soup.select("title,a")   # union selector
p = soup.select('p[class="story"]')[1]  # attribute selector; [1] picks the second match

# Text wrapped by the tag.
p_content = p.get_text()

# Attribute lookup: multi-valued attributes such as "class" come back as a list,
# so index into it to get a single class name.
p_class = p.get("class")
print(p_class[0])
字符串操作
# Not recommended: building a string with += inside a loop.
colors = ['red', 'blue', 'green', 'yellow']
result = ''
for s in colors:
    # Conceptually every += throws away the previous string object and
    # creates a new, longer one.
    result += s
# Recommended: hand the whole list to str.join, which assembles the final
# string in one call instead of creating an intermediate string per element.
colors = [
    "red",
    "blue",
    "green",
    "yellow",
]
result = "".join(colors)
下载进度百分比(urlretrieve 的回调函数),在堆糖项目里学到的
import urllib
def callbackfunc(blocknum, blocksize, totalsize):
    """Print the download progress as a percentage.

    Has the three-argument signature expected by the ``reporthook``
    parameter of ``urlretrieve``.

    Args:
        blocknum: Number of data blocks transferred so far.
        blocksize: Size of one data block.
        totalsize: Size of the remote file. ``urlretrieve`` may report -1
            when the server does not announce a size.
    """
    if totalsize <= 0:
        # Unknown (or empty) size: a percentage cannot be computed, and
        # dividing would raise or yield a negative value, so print nothing.
        return
    # The last block is usually only partly filled, so cap the value at 100.
    percent = min(100.0 * blocknum * blocksize / totalsize, 100)
    print("%.2f%%" % percent)
import urllib.request  # Python 3 home of urlretrieve

# Download the page to a local file; callbackfunc is invoked as the report
# hook while the transfer progresses.
url = 'http://www.sina.com.cn'
local = r'd:\sina.html'  # raw string keeps the backslash literal
urllib.request.urlretrieve(url, local, callbackfunc)
import re

# Collect every Chinese character found in `i`; the character class covers
# the range U+4E00-U+9FA5. `i` is the input string and must be defined by the
# surrounding code.
chinese = re.findall(r'[\u4e00-\u9fa5]', i)