import re
import time
from urllib import parse

import requests
from lxml import etree

from mysql_test import mysql_connect

# Listing URL; ``start`` is the offset of the first posting on the page.
_LIST_URL = 'https://hr.tencent.com/position.php?keywords=&lid=0&tid=87&start=%d'

# The listing offset advances by this much per page.
_PAGE_SIZE = 10

_INSERT_SQL = (
    'insert into txzhaopin (title, location, type, num, detail, request) '
    'values (%s,%s,%s,%s,%s,%s)'
)

# NOTE(review): the Cookie value is a hard-coded captured session; presumably
# it expires at some point -- confirm whether the site needs it at all.
_HEADERS = {
    'Cookie': 'PHPSESSID=ipcbv7o54krhuhgf9e1cc5km07; pgv_pvi=5188443136; pgv_si=s6186220544',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3377.1 Safari/537.36',
}

# Seconds to wait for each HTTP response before giving up, so a stalled
# connection cannot hang the whole crawl.
_REQUEST_TIMEOUT = 10

_DIGITS = re.compile(r'\d+')


def _parse_position(detail_tree):
    """Extract one posting's fields from a parsed detail page.

    Returns a ``(title, location, type, num, detail, request)`` tuple of
    strings in the column order of the insert statement, or ``None`` when
    the page does not have the expected table layout.
    """
    rows = detail_tree.xpath('//table[@class="tablelist textl"]/tr')[:4]
    if len(rows) < 4:
        return None

    try:
        title = rows[0].xpath('./td')[0].text
        location = rows[1].xpath('./td[1]/text()')[0]
        job_type = rows[1].xpath('./td[2]/text()')[0]
        headcount_text = rows[1].xpath('./td[3]/text()')[0]
    except IndexError:
        # One of the expected cells is missing.
        return None

    # Only the first run of digits in the third cell is stored.
    headcount = _DIGITS.search(headcount_text)
    if headcount is None:
        return None

    detail = ''.join(rows[2].xpath('./td/ul/li/text()'))
    request = ''.join(rows[3].xpath('./td/ul/li/text()'))
    return (title, location, job_type, headcount.group(0), detail, request)


def txzhaopin(num):
    """Scrape Tencent job postings and insert each one into MySQL.

    Walks ``num`` listing pages (offsets 0, 10, 20, ...), follows every
    posting link found on each page, prints the extracted fields, and
    inserts them into the ``txzhaopin`` table.

    Args:
        num: Number of listing pages to crawl.
    """
    # One connection for the whole crawl instead of a new one per page.
    mc = mysql_connect()

    for start in range(0, num * _PAGE_SIZE, _PAGE_SIZE):
        url = _LIST_URL % start
        response = requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
        listing_tree = etree.HTML(response.text)

        # The first and last rows of the listing table are dropped; they
        # carry no posting link (presumably header and pager rows).
        listing_rows = listing_tree.xpath('//table[@class="tablelist"]/tr')[1:-1]

        for row in listing_rows:
            hrefs = row.xpath('./td/a/@href')
            if not hrefs:
                continue

            # Links on the listing page are relative to the listing URL.
            sec_url = parse.urljoin(url, hrefs[0])
            print(sec_url)

            res = requests.get(sec_url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
            record = _parse_position(etree.HTML(res.text))

            if record is None:
                print('skipped (unexpected page layout): %s' % sec_url)
            else:
                for field in record:
                    print(field)
                mc.mysql_insert_modify(_INSERT_SQL, record)

            # Throttle: pause after every detail-page request.
            time.sleep(1)


if __name__ == '__main__':
    # Crawl the first three listing pages when run as a script.
    txzhaopin(3)
我爱我家租房信息
最新推荐文章于 2021-10-22 17:35:14 发布