技能点:正则 requests tqdm BeautifulSoup4 PyQuery celery redis
1.requests 来获取html页面
2.tqdm 用来显示程序完成进度条
3. beautifulsoup4 PyQuery 用来解析HTML页面提取有效信息
4.celery 用来异步发送邮件
5.redis 作为消息队列(broker),用来存储 celery 需要执行的任务
获取html和解析html代码如下:
#-*- coding:utf-8 -*- import re from urllib.parse import urlencode import requests import time from tqdm import tqdm from bs4 import BeautifulSoup from pyquery import PyQuery from celery_tasks.email import tasks as email def get_main_html(city,word,page): '''获取主页源代码''' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36', 'Host': 'sou.zhaopin.com', 'Referer': 'https://www.zhaopin.com/', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'zh-CN,zh;q=0.9' } data = { 'jl': city, # 搜索城市 'kw': word, # 搜索关键词 'isadv': 0, # 是否打开更详细搜索选项 'isfilter': 1, # 是否对结果过滤 'p': page # 页数 } url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?'+ urlencode(data) response = requests.get(url,headers=headers)