# Regular Expressions
import re
str1 = 'www.kaikeba.com'
result = re.match('www',str1)
# Get the matched result
# print(result.group())
# Get the (start, end) span of the match
# print(result.span())
# match() only matches at the beginning of the string; otherwise it returns None
# print(re.match('kaikeba',str1))
# Matching is case-sensitive by default
# print(re.match('WWW',str1))
# Pass the re.I flag to ignore case
print(re.match('WWW',str1,re.I).group())
www
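The same flag can also be baked into a precompiled pattern, which is handy when the pattern is reused (a minimal sketch, not from the original):
pattern = re.compile('WWW', re.I)   # compile once, flag attached
print(pattern.match(str1).group())  # www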
str2 = 'abc 123 def'
# Ordinary matching
# print(re.match(r'^abc\s\d\d\d\sdef$',str2).group())
# print(re.match(r'^abc\s\d{3}\sdef$',str2).group())
# Capture the wanted part with a group
print(re.match(r'^abc\s(.*)\sdef$',str2).group(1))
123
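Named groups make such extractions self-documenting (a minimal sketch on the same str2; the group name digits is an arbitrary choice):
m = re.match(r'^abc\s(?P<digits>\d{3})\sdef$', str2)
print(m.group('digits'))  # 123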
content= "hello 1234567 World Demo"
# Goal: match out 1234567
result = re.match(r'^hello.*(\d+).*Demo',content)
# print(result)
print(result.group(1))
# Only '7' comes out because the leading .* swallowed the rest of the digits:
# .* matches as much as it possibly can, which is what we call greedy matching
result = re.match(r'^hello.*?(\d+).*Demo',content)
# print(result)
print(result.group(1))
7
1234567
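To see greedy versus lazy directly, compare the two quantifiers side by side on the same content (a minimal sketch; both patterns are illustrations, not from the original):
print(re.match(r'^hello(.*)Demo$', content).group(1))  # greedy: ' 1234567 World '
print(re.match(r'^hello(.*?)\d', content).group(1))    # lazy: ' ' (stops before the first digit)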
import re
html = '''<div id="songs-list">
    <h2 class="title">经典老歌</h2>
    <p class="introduction">
        经典老歌列表
    </p>
    <ul id="list" class="list-group">
        <li data-view="2">一路上有你</li>
        <li data-view="7">
            <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
        </li>
        <li data-view="4" class="active">
            <a href="/3.mp3" singer="齐秦">往事随风</a>
        </li>
        <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li>
        <li data-view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a></li>
        <li data-view="5">
            <a href="/6.mp3" singer="邓丽君">但愿人长久</a>
        </li>
    </ul>
</div>'''
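# Find the <li> carrying class "active" and capture its singer attribute and song title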
result = re.search('<li.*?active.*?singer="(.*?)">(.*?)</a>',html,re.S)
print(result)
print(result.groups())
print(result.group(1))
print(result.group(2))
<re.Match object; span=(153, 366), match='<li data-view="2">一路上有你</li>\n        <li data-vi>
('齐秦', '往事随风')
齐秦
往事随风
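re.search stops at the first hit; to collect every singer/song pair from the same html, re.findall can be used instead (a minimal sketch; the pattern is an illustration, and the first <li>, which has no <a> tag, simply gets absorbed into the first match rather than producing a tuple of its own):
results = re.findall(r'<li.*?href="(.*?)".*?singer="(.*?)">(.*?)</a>', html, re.S)
for link, singer, song in results:
    print(link, singer, song)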
# TiebaSpider: a simple Baidu Tieba crawler
import requests
import re
import csv
class TiebaSpider(object):
    def __init__(self, tiebaName):
        self.tiebaName = tiebaName
        self.base_url = 'https://tieba.baidu.com/f?kw='+tiebaName+'&ie=utf-8&pn={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
        }

    # Build the request URLs (Tieba paginates in steps of 50)
    def get_url_list(self):
        url_list = []
        for i in range(1):  # only the first page; widen the range to crawl more
            url_list.append(self.base_url.format(i*50))
        return url_list

    # Request the page
    def get_pageInfo(self, url):
        response = requests.get(url=url, headers=self.headers)
        return self.parse_pageInfo(response.content.decode('utf-8'))

    # Parse the page into (link, title) tuples
    def parse_pageInfo(self, html):
        pattern = re.compile('<li class=" j_thread_list clearfix".*?<a rel="noreferrer".*?href="(.*?)".*?title="(.*?)".*?</a>', re.S)
        # print(re.findall(pattern, html))
        return re.findall(pattern, html)

    # Save to a txt file
    def save_to_txt(self, info):
        for tuple_value in info:
            info_str = 'Thread title: '+tuple_value[1]+' Thread link: https://tieba.baidu.com'+tuple_value[0]+'\n'
            # print(info_str)
            with open('./tieba.txt', 'a') as f:
                f.write(info_str)

    # Save to a csv file
    def save_to_csv(self, info):
        for tuple_value in info:
            # newline='' suppresses the extra '\r\n' blank lines
            # encoding='utf-8-sig' keeps the Chinese text from garbling in Excel
            with open('./tieba.csv', 'a', newline='', encoding='utf-8-sig') as f:
                writer = csv.writer(f)
                writer.writerow(list(tuple_value))

    # All the top-level logic lives here
    def run(self):
        # 1. Build the URLs
        url_list = self.get_url_list()
        # 2. Request each one, then save the parsed results
        for url in url_list:
            info = self.get_pageInfo(url)
            # self.save_to_txt(info)
            self.save_to_csv(info)

if __name__ == "__main__":
    tiebaspider = TiebaSpider('lol')
    tiebaspider.run()
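Reopening the CSV file for every row works but is wasteful; a minimal alternative sketch (save_to_csv_once is a hypothetical helper, not part of the original class) opens the file once, writes a header row, then all rows in one pass:
def save_to_csv_once(info, path='./tieba.csv'):
    # Open once, emit a header, then every (link, title) tuple
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['link', 'title'])
        writer.writerows(info)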