目标:
抓取 http://www.51job.com/ 上与 python 相关的招聘信息(入门级别)
import os
import re
import urllib.request
class Grab(object):
    """Scrape job postings linked from a 51job search-result page.

    ``openurl`` downloads the search page stored in ``self.url``, extracts
    every ``http://jobs.51job.com...html`` link from it and passes each link
    to ``deal``. ``deal`` downloads that page and writes its ``<p>...</p>``
    fragments to ``zhaopin/<n>.txt``.
    """

    # Running count of saved pages. It is shared by all instances and is used
    # as the output file name (0.txt, 1.txt, ...).
    num = 0

    # Output directory, relative to the current working directory.
    _OUT_DIR = "zhaopin"

    # A job link ends at its first ".html". The match is lazy and excludes
    # whitespace, quotes and angle brackets so that two links on the same
    # line are returned separately instead of being fused into one string.
    _LINK_RE = re.compile(r"""http://jobs\.51job\.com[^\s"'<>]+?\.html""")

    # Lazy match so several paragraphs on one line stay separate.
    _PARA_RE = re.compile(r"<p>.*?</p>")

    def __init__(self):
        """Store the search-page URL and the headers sent when requesting it."""
        # Search-result page to request.
        self.url = "http://search.51job.com/list/010000,000000,0000,00,9,99,python,2,1.html?"
        # Request headers for the search page.
        self.headers = {
            "Host": "search.51job.com",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
        }

    def openurl(self):
        """Fetch ``self.url`` and call ``deal`` for every job link found.

        Raises:
            urllib.error.URLError: if the search page (or, via ``deal``, a
                linked page) cannot be opened.
        """
        request = urllib.request.Request(url=self.url, headers=self.headers)
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(request) as response:
            raw = response.read()
        # Pages are decoded as GBK; undecodable bytes are replaced so one bad
        # byte does not abort the whole crawl.
        html = raw.decode("gbk", errors="replace")
        for link in self._LINK_RE.findall(html):
            self.deal(link)

    def deal(self, url):
        """Download *url* and save its ``<p>...</p>`` fragments to a file.

        The fragments are written as the ``str()`` of a list to
        ``zhaopin/<Grab.num>.txt`` (UTF-8), after which ``Grab.num`` is
        incremented. The output directory is created if it is missing.

        Args:
            url: Address of the page to download.

        Raises:
            urllib.error.URLError: if *url* cannot be opened.
            OSError: if the output file cannot be written.
        """
        with urllib.request.urlopen(url) as response:
            raw = response.read()
        paragraphs = self._PARA_RE.findall(raw.decode("gbk", errors="replace"))

        os.makedirs(self._OUT_DIR, exist_ok=True)
        out_path = os.path.join(self._OUT_DIR, str(Grab.num) + ".txt")
        # Explicit encoding: the platform default may be unable to encode the
        # Chinese text found on these pages.
        with open(out_path, "w", encoding="utf-8") as out_file:
            out_file.write(str(paragraphs))
        Grab.num += 1
def main():
    """Create a ``Grab`` instance and run its ``openurl`` crawl once."""
    Grab().openurl()
# Run the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()