# 为了更快捷,使用多线程爬取 (use multithreading to crawl faster)
import requests
from lxml import etree
import threading
def get_request(page):
    """Fetch one page of 51job search results for "Python" jobs.

    Args:
        page: 1-based page number spliced into the listing URL.

    Returns:
        The decoded HTML of the result page as a string.

    Raises:
        requests.RequestException: on any network/HTTP failure.
    """
    # BUGFIX: the original URL contained "cotype=99°reefrom=99" — the "&deg"
    # of "&degreefrom" had been HTML-entity-decoded into "°", corrupting the
    # query string. Restored to "&degreefrom=99".
    url = (
        "https://search.51job.com/list/190200,000000,0000,00,9,99,Python,2,"
        + str(page)
        + ".html?lang=c&postchannel=0000&workyear=99&cotype=99"
          "&degreefrom=99&jobterm=99&companysize=99&ord_field=0"
          "&dibiaoid=0&line=&welfare="
    )
    headers = {
        # Placeholder UA ("头部信息" = "header info") — replace with a real
        # browser User-Agent string before running.
        "User-Agent": "头部信息"
    }
    res = requests.get(url, headers=headers)
    # Let requests guess the page encoding from the content, since the
    # server's declared charset may be missing or wrong.
    res.encoding = res.apparent_encoding
    return res.text
def get_content(respon):
    """Parse one 51job result page and print every job row found.

    Args:
        respon: HTML text of a search-result page (from ``get_request``).
    """
    rows = etree.HTML(respon).xpath('//div[@class="dw_table"]//div[@class="el"]')
    print(rows)
    for value in rows:
        try:
            position = value.xpath('./p/span/a/text()')[0].strip()
            link = value.xpath('./p/span/a/@href')[0]
            company = value.xpath('./span/a/text()')[0]
            address = value.xpath('./span[@class="t3"]/text()')[0]
            # Some rows omit the salary cell; fall back to "null" instead of
            # failing the whole row (original used a manual len() check).
            wage_nodes = value.xpath('./span[@class="t4"]/text()')
            wage = wage_nodes[0] if wage_nodes else "null"
            updataTime = value.xpath('./span[@class="t5"]/text()')[0]
            print("职位链接:", link)
            print("职位名:", position)
            print("公司名:", company)
            print("工作地点:", address)
            print("工资:", wage)
            print("发布时间:", updataTime)
            print("-" * 30)
        except IndexError as e:
            # Narrowed from broad Exception: the only expected failure here
            # is a required field missing from the row (empty xpath result).
            print("发生错误")
            print(e)
def main(page):
    """Crawl a single result page, reporting success or failure.

    Args:
        page: page number handed through to ``get_request``.
    """
    print("第", page, "页", end="")
    try:
        get_content(get_request(page))
    except Exception as err:
        # Any failure (network or parse) is reported, never raised.
        print("爬取失败")
        print(err)
    else:
        print("爬取成功")
def run1():
    """Crawl the odd-numbered pages (1 through 11)."""
    for odd_page in (1, 3, 5, 7, 9, 11):
        main(odd_page)
def run2():
    """Crawl the even-numbered pages (2 through 10)."""
    for even_page in (2, 4, 6, 8, 10):
        main(even_page)
if __name__ == '__main__':
    # One worker per page-parity: run1 takes the odd pages, run2 the even
    # ones, so the two halves download concurrently.
    workers = [threading.Thread(target=fn) for fn in (run1, run2)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()