环境准备
搭建Python环境
略
安装requests:
进入Python安装目录下的Scripts目录(确保pip.exe文件存在,正常情况下存在)
进入cmd终端并进入该目录,输入命令"pip install requests"
re模块(无需安装)
re是Python标准库自带的模块,直接import re即可使用,不需要通过pip安装
爬取珍爱网信息代码
import requests
import re
class Zhenghun(object):
    """Scraper for the zhenai.com listing page of women in one city.

    Builds the listing URL from a city slug, downloads the page and
    extracts the anchor tags that link to member album pages.
    """

    url = "http://www.zhenai.com/zhenghun/"
    # Seconds to wait for the server. Without a timeout requests.get()
    # can block forever on a stalled connection.
    timeout = 10
    # Raw string with escaped dots so "." only matches a literal dot
    # in the host name. Compiled once and reused by every parse() call.
    _PROFILE_LINK = re.compile(
        r'<a href="http://album\.zhenai\.com/u/[0-9]+"[^>]*[^<]+</a>'
    )

    def __init__(self, city):
        """Store the listing URL for *city* (a URL slug such as "dongcheng")."""
        # The "/nv" suffix restricts the listing to women.
        self.newurl = Zhenghun.url + city + "/nv"

    def getHtml(self):
        """Download the listing page and return it decoded as UTF-8 text.

        Raises:
            requests.exceptions.Timeout: if the server does not answer
                within ``timeout`` seconds.
        """
        response = requests.get(self.newurl, timeout=self.timeout)
        return response.content.decode("utf-8")

    def parse(self, html=None):
        """Return every album-page anchor tag found in the listing page.

        Args:
            html: Page source to search. When omitted, the page is
                downloaded with getHtml().

        Returns:
            A list of strings, each the full matched text from
            ``<a href="http://album.zhenai.com/u/<digits>"`` through the
            closing ``</a>``.
        """
        if html is None:
            html = self.getHtml()
        return self._PROFILE_LINK.findall(html)
def main():
    """Print the name and album link of every match for a fixed set of cities."""
    # City slugs to crawl, in the order their results are printed.
    for city_slug in ("dongcheng", "chaoyang1", "changping"):
        anchors = Zhenghun(city_slug).parse()
        for anchor in anchors:
            # The link is the first double-quoted value in the tag.
            link = anchor.split('"')[1]
            # The name starts two characters past the tag's last double
            # quote and ends at the last "<" (the closing tag).
            name_start = anchor.rfind('"') + 2
            name_end = anchor.rfind("<")
            name = anchor[name_start:name_end]
            # One line per person: name, three tabs, link.
            print("\t\t\t".join((name, link)))


# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
爬取结果(已遮挡网址,侵删)
爬取Discuz论坛发帖和回帖代码
代码尚未优化
import requests
import re
# Crawls thread-starter posts and replies from a Discuz forum board.
class Discuz(object):
    """Crawler for the threads linked from one Discuz forum index page.

    The index page is scanned for thread links, each thread page is
    downloaded, and the text of every post body found is printed.
    """

    # Collected absolute thread URLs. Defined on the class, so the same
    # set is shared by every instance that does not override it.
    list2 = set()
    # Forum index page the crawl starts from.
    url = "https://www.discuz.net/forum-plugin-1.html"
    # Seconds to wait for the server. Without a timeout requests.get()
    # can block forever on a stalled connection.
    timeout = 10
    # Patterns are raw strings (so "\s" is not an invalid string escape)
    # and are compiled once instead of once per page.
    _THREAD_LINK = re.compile(r'<a href="thread-[0-9]*-1-1\.html" onclick')
    # A post body is whatever sits between the literal text 999"> and the
    # next </td>. The group captures only the body, so no later slicing
    # is needed to drop the marker.
    _POST_BODY = re.compile(r'999">([\s\S]*?)</td>')
    _TAG = re.compile(r"<(.*?)>")

    def __init__(self):
        """Start from the default forum index page."""
        self.firstUrl = Discuz.url

    def _fetch(self, url):
        """Download *url* and return its body decoded as GBK text."""
        raw = requests.get(url, timeout=self.timeout).content
        # errors="replace": one byte outside GBK must not abort the crawl.
        return raw.decode("gbk", errors="replace")

    def getFirstHtml(self):
        """Return the HTML of the forum index page (``self.firstUrl``)."""
        return self._fetch(self.firstUrl)

    def getPath(self, html=None):
        """Return the opening fragment of every thread link on the index page.

        Args:
            html: Index page source. When omitted, it is downloaded with
                getFirstHtml().

        Returns:
            A list of strings such as
            ``<a href="thread-123-1-1.html" onclick``.
        """
        if html is None:
            html = self.getFirstHtml()
        return self._THREAD_LINK.findall(html)

    def getSecondHtml(self, html=None):
        """Turn the thread links into absolute URLs and store them in list2.

        Args:
            html: Index page source, forwarded to getPath().

        Returns:
            The ``list2`` set, with the URLs found on this call added.
        """
        from urllib.parse import urljoin

        # Use this instance rather than a fresh Discuz(), so a customised
        # firstUrl is honoured. Links are resolved against it.
        links = self.list2
        for fragment in self.getPath(html):
            relative = fragment.split('"')[1]
            links.add(urljoin(self.firstUrl, relative))
        return links

    @staticmethod
    def _extractPosts(html):
        """Return the text of every post body in *html*.

        Tags, spaces and newlines are removed from each body.
        """
        posts = []
        for body in Discuz._POST_BODY.findall(html):
            text = Discuz._TAG.sub("", body)
            posts.append(text.replace(" ", "").replace("\n", ""))
        return posts

    def getThirdHtml(self, Urls):
        """Download every thread in *Urls* and print each post body found.

        Args:
            Urls: Iterable of absolute thread URLs.
        """
        for thread_url in Urls:
            for text in self._extractPosts(self._fetch(thread_url)):
                print(text)
def main():
    """Collect the thread links from the forum index, then print every post."""
    crawler = Discuz()
    # First pass: gather the absolute URL of each thread on the index page.
    thread_urls = crawler.getSecondHtml()
    # Second pass: download each thread and print its post bodies.
    crawler.getThirdHtml(thread_urls)


# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
爬取结果
由于该网站中发帖和回帖内容所在的标签有多种形式,只用一条正则表达式会导致爬取结果不全,可以使用多条正则表达式分别匹配这些标签来获取完整信息