>︿< 入不了门 重新开始 今天很慌张,感觉大家都开始做了
通过正则表达式获取文本信息
- re.findall:返回 string 中所有与 pattern 相匹配的子串,返回形式为列表(list)
- `(.*?)`:非贪婪匹配任意字符,并作为捕获组提取两侧标记之间的内容
- 参数 re.DOTALL:让 `.` 也能匹配换行符,从而选取跨多行的信息
from urllib.request import urlopen
import re

# Fetch the demo page and decode the raw bytes into text.
# urlopen("url") opens the page; read().decode('utf-8') reads and decodes it.
html = urlopen(
    "https://morvanzhou.github.io/static/scraping/basic-structure.html"
).read().decode('utf-8')
print(html)

# Regex extraction: a pattern selects a specific span of the text.
# re.findall returns ALL matching substrings as a list.
# NOTE(review): the exported notes lost the literal HTML tags around the
# capture groups; they are restored here (<title>…</title>, <p>…</p>).
res = re.findall(r"<title>(.+?)</title>", html)
print("\nPage title is: ", res[0])  # text between <title> and </title>

# flags=re.DOTALL makes '.' match newlines too, so multi-line
# paragraph content is captured.
res = re.findall(r"<p>(.*?)</p>", html, flags=re.DOTALL)
print("\nPage paragraph is: ", res[0])

# Every href attribute value, i.e. all links on the page.
res = re.findall(r'href="(.*?)"', html)
print("\nAll links: ", res)
## bs4:简化匹配过程、选取 tag 信息(代替正则表达式)
- pip install wheel
- pip install lxml
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Fetch and decode the same demo page.
# FIX: the notes used curly quotes (‘utf-8’) which is a SyntaxError;
# decode() needs a plain quoted string.
html = urlopen(
    "https://morvanzhou.github.io/static/scraping/basic-structure.html"
).read().decode('utf-8')
print(html)

# The 'features' argument selects the parser backend (here: lxml).
soup = BeautifulSoup(html, features='lxml')
print(soup.h1)
print('\n', soup.p)

# soup.find_all('a') returns every <a> tag on the page.
# FIX: original notes mixed 'all_herf' and 'all_href' (NameError);
# one consistent name is used throughout.
all_href = soup.find_all('a')
print(all_href)

# Extract the href attribute of each link. Equivalent to:
#   for l in all_href:
#       print(l['href'])
all_href = [l['href'] for l in all_href]
使用 tag 与 CSS 的 class(信息捆绑形式)
- class:用于批量选取同一类型的信息
from bs4 import BeautifulSoup
from urllib.request import urlopen

# NOTE(review): "https://textweb" is a placeholder URL from the notes,
# kept as-is. The decode() argument must be a quoted string.
html = urlopen("https://textweb").read().decode('utf-8')

soup = BeautifulSoup(html, features='lxml')

# Find every <li> tag whose class is "month".
month = soup.find_all('li', {"class": "month"})
for m in month:
    # print(m) would emit the whole tag (including markup/links);
    # m.get_text() yields only the text content.
    print(m.get_text())

# Search nested tags: under the <ul class="jan"> tag, find its
# <li class="jan"> children.
jan = soup.find('ul', {"class": "jan"})
d_jan = jan.find_all('li', {"class": "jan"})
for d in d_jan:
    print(d.get_text())