腾讯:
import json
import re

import pymysql
import requests

# Listing page; a "&start=<offset>#a" suffix is appended for every page after the first.
_LIST_URL = 'https://hr.tencent.com/position.php?keywords=&lid=0'
# Prefix joined with the relative hrefs found on a listing page.
_SITE_ROOT = 'https://hr.tencent.com/'
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
}
# Seconds to wait for the server before giving up; without it a stalled
# connection would block the crawl forever.
_REQUEST_TIMEOUT = 10

# Compiled once instead of on every page. Raw strings keep "\s" a regex escape
# rather than an (invalid) string escape.
_LINK_RE = re.compile(r'<a target="_blank" href="(.*?)">')
_FIELD_PATTERNS = {
    'title': re.compile(r'<title>(.*?)</title>'),
    'place': re.compile(r'工作地点:</span>(.*?)</td>'),
    'job_list': re.compile(r'职位类别:</span>(.*?)</td>'),
    'job_duty': re.compile(r'工作职责:</div>\s.*<ul class="squareli"><li>(.*?)</li></ul>'),
    'job_ask': re.compile(r'工作要求:</div>\s.*<ul class="squareli"><li>(.*?)</li></ul>'),
}

# Placeholders are filled in by the driver, so quotes or backslashes in the
# scraped text cannot break (or inject into) the statement.
_INSERT_SQL = (
    'insert into tx(title,place,job_list,job_duty,job_ask) '
    'values (%(title)s,%(place)s,%(job_list)s,%(job_duty)s,%(job_ask)s)'
)


class mysql_conn(object):
    """Small wrapper around one pymysql connection and its cursor."""

    def __init__(self):
        """Connect to the ``py11_mysql`` database on 127.0.0.1:3306 and open a cursor."""
        # Defined before connecting so close()/__del__ stay safe if connect() raises.
        self.db = None
        self.cursor = None
        self.db = pymysql.connect(
            host='127.0.0.1',
            user='root',
            password='123456',
            port=3306,
            database='py11_mysql',
        )
        self.cursor = self.db.cursor()

    def execute_modify_mysql(self, sql, params=None):
        """Execute a data-modifying statement and commit it.

        Args:
            sql: The SQL statement. May contain ``%s`` / ``%(name)s``
                placeholders when ``params`` is given.
            params: Optional tuple, list or dict of values bound to the
                placeholders by the driver. ``None`` runs ``sql`` verbatim,
                exactly as before this parameter existed.
        """
        self.cursor.execute(sql, params)
        self.db.commit()

    def close(self):
        """Close the cursor and the connection; calling it again is a no-op."""
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.db is not None:
            self.db.close()
            self.db = None

    def __del__(self):
        """Destructor: release the cursor and connection if still open."""
        self.close()


def _parse_position(info_html):
    """Extract the position fields from a detail page, or return None if any is missing."""
    data = {}
    for field, pattern in _FIELD_PATTERNS.items():
        match = pattern.search(info_html)
        if match is None:
            return None
        data[field] = match.group(1)
    return data


def tx(sql):
    """Crawl the Tencent HR listing and insert every position into table ``tx``.

    Args:
        sql: Despite its name this is not SQL: it is the highest zero-based
            listing-page index to fetch. Pages ``0..sql`` inclusive are
            requested (``sql + 1`` pages), and the ``start`` offset advances
            by 10 per page.

    Each link found on a listing page is fetched. Pages that do not contain
    all five expected fields are reported and skipped instead of aborting the
    whole crawl.
    """
    # One connection for the whole crawl instead of a fresh one per row.
    my = mysql_conn()
    try:
        page = ''
        for page_index in range(sql + 1):
            url = _LIST_URL + page
            print(url)
            listing_html = requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT).text
            for relative_link in _LINK_RE.findall(listing_html):
                detail_url = _SITE_ROOT + relative_link
                detail_html = requests.get(detail_url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT).text
                data = _parse_position(detail_html)
                if data is None:
                    # The link pattern matches every target="_blank" anchor,
                    # so some hits may not be position pages.
                    print('skipped, expected fields not found: ' + detail_url)
                    continue
                my.execute_modify_mysql(_INSERT_SQL, data)
            # Suffix for the next listing page.
            page = '&start=' + str((page_index + 1) * 10) + '#a'
            print(page)
    finally:
        my.close()


tx(10)
头条:
import json
import os
import re
from urllib import request

import requests

# Download every image of one Toutiao gallery article into a local folder.
url = 'https://www.toutiao.com/a6331698802248909057/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
}
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
html_str = response.text

# The gallery data is embedded in the page as: gallery: JSON.parse(<argument>),
pattern = r'gallery: JSON\.parse\((.*)\),'
res_lists = re.search(pattern, html_str)

path = 'D:/Python/download'
# Create the target folder first; chdir() into a missing folder raised
# FileNotFoundError before anything was downloaded.
os.makedirs(path, exist_ok=True)
os.chdir(path)

if res_lists:
    # Decoded twice: the captured argument is first decoded on its own, and the
    # result is decoded again to obtain the dict that holds 'sub_images'.
    img_lists = json.loads(res_lists.group(1))
    img_dict = json.loads(img_lists)
    for img in img_dict['sub_images']:
        img_url = img['url']
        # The last URL segment is used as the file name, with '.jpg' appended.
        filename = img_url.split('/')[-1] + '.jpg'
        request.urlretrieve(img_url, filename)
else:
    # Say why nothing was downloaded instead of exiting silently.
    print('gallery data not found in page: ' + url)