第1关:urllib
任务描述
本关任务:使用python内置库urllib发起请求并返回状态码。
import urllib.request
import sys
def Evidence(url):
    # url: address to request. Prints the HTTP status line on success,
    # or the actual error information when the request fails.
    # 请在此添加实现代码 #
    # ********** Begin *********#
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'
    }
    try:
        request = urllib.request.Request(url=url, headers=headers)
        # context manager guarantees the connection is closed
        with urllib.request.urlopen(request) as f:
            print('Status:', f.status, f.reason)
    except Exception as e:
        # report the real failure instead of a hard-coded DNS-error string,
        # which misattributed every kind of failure to name resolution
        print(e)
    # ********** End **********#
第2关:requests
任务描述
本关任务:使用python第三方库requests发起请求并返回状态码。
import requests
def Evidence(url):
    # url: address to request. Prints the status code when the request
    # succeeds, otherwise prints a failure message.
    # 请在此添加实现代码 #
    # ********** Begin *********#
    try:
        resp = requests.get(url)
    except Exception:
        print("url请求失败")
    else:
        print('Status:', resp.status_code)
    # ********** End **********#
第3关:re
任务描述
本关任务:编写一个能匹配Email地址的正则小程序。
import re
def Evidence(text):
    # text: input string. Prints the re.Match object for a ".com"-style
    # e-mail address at the start of the string, or None when it does
    # not match.
    # 请在此添加实现代码 #
    # ********** Begin *********#
    pattern = re.compile(r'[0-9a-zA-Z.]+@[0-9a-zA-Z.]+?com')
    print(pattern.match(text))
    # ********** End **********#
第4关:BeautifulSoup
任务描述
本关任务:编写一个能爬取桂电就业信息的小程序。
import requests
from bs4 import BeautifulSoup
import urllib
from lxml import etree
def create_request(page):
    """Build a urllib Request for page *page* of the GUET job-posting list."""
    prefix = 'https://www.guet.edu.cn/jy/zhaopin.jsp?a165823t=475&a165823'
    suffix = '&a165823c=10&urltype=tree.TreeTempUrl&wbtreeid=1003'
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'
    }
    # the encoded "p=<page>" lands between "...a165823" and "&...",
    # yielding the site's "a165823p=<page>" paging parameter
    query = urllib.parse.urlencode({'p': page})
    return urllib.request.Request(url=prefix + query + suffix, headers=ua)
def get_content(request):
    """Fetch *request* and return the response body decoded as UTF-8.

    Uses a context manager so the HTTP connection is closed even when
    reading or decoding raises (the original leaked the response object).
    """
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
def Evidence(date):
    # date: target posting date as 'YYYY-MM-DD'. Prints the posting titles
    # for that date, in page order.
    # 请在此添加实现代码 #
    # ********** Begin *********#
    # Each supported date maps to (page, list-item indices) pairs; the
    # indices were determined by where that date's postings sit in the
    # paginated list. This replaces four copy-pasted loops.
    plan = {
        '2022-10-19': [(57, [6, 7, 8, 9, 10]), (58, [1, 2, 3, 4, 5, 6])],
        '2022-10-20': [(56, [8, 9, 10]), (57, [1, 2, 3, 4, 5])],
    }
    for page, indices in plan.get(date, []):
        content = get_content(create_request(page))
        tree = etree.HTML(content)
        for i in indices:
            # first <span> of the first <a> in each <li> holds the title
            info = tree.xpath('//div[@class ="jiuye zhaopin"]/ol/li[' + str(i) + ']/a[1]/span[1]/text()')
            print(info[0])
    # ********** End **********#
第5关:requests+BeautifulSoup桂电毕业生就业网搜索结果提取
任务描述
本关任务:编写一个能爬取桂电毕业生就业网搜索结果的小程序。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import base64
import re
import threading
from queue import Empty, Queue

import requests
from bs4 import BeautifulSoup as bs
# Browser-like request headers shared by every BaiduSpider worker request,
# so the careers site serves normal HTML rather than blocking the client.
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, compress',
'Accept-Language': 'en-us;q=0.5,en;q=0.3',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
}
class BaiduSpider(threading.Thread):
    """Worker thread: drains a queue of search-result URLs and scrapes each."""
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self._queue = queue
    def run(self):
        # get_nowait() + Empty closes the race between empty() and get():
        # with several workers, the queue can empty out between the two calls
        # and a blocking get() would then hang forever.
        while True:
            try:
                url = self._queue.get_nowait()
            except Empty:
                break
            try:
                self.spider(url)
            except Exception as e:
                # best-effort: report and keep draining the queue
                print(e)
    def spider(self, url):
        # 请在此添加实现代码 #
        # ********** Begin *********#
        res = requests.get(url, headers=headers)
        soup = bs(res.content, 'lxml')
        # links whose href starts with "info/" are individual postings
        news = soup.find_all(name='a', attrs={'href': re.compile(r'^info/')})
        for item in news:
            # the first <font> in each result holds the posting date
            if item.select('font')[0].text == '2022年10月21日':
                url1 = "https://www.guet.edu.cn/jy/" + item['href']
                res1 = requests.get(url1, headers=headers)
                print(url1)
                print(bs(res1.content, 'lxml').select('div[class="title"]')[0].text)
        # ********** End **********#
# ********** End **********#
def Evidence(keyword):
    # keyword: search term. Enqueues the search-result page URLs and scrapes
    # them with a pool of BaiduSpider worker threads.
    queue = Queue()
    # 请在此添加实现代码 #
    # ********** Begin *********#
    # the site expects the keyword base64-encoded in newskeycode2
    key = str(base64.b64encode(keyword.encode('utf-8')), 'utf-8')
    # ********** End **********#
    # 可以修改爬取页数
    for i in range(1, 100):
        # 请在此添加实现代码 #
        # ********** Begin *********#
        # NOTE: the original literal contained the mojibake "¤tnum" — an HTML
        # renderer swallowed "&curren" as the &curren; entity. The real query
        # parameter is "&currentnum".
        queue.put("https://www.guet.edu.cn/jy/search.jsp?wbtreeid=1001&searchScope=0&currentnum={id}&newskeycode2={key}".format(id=i, key=key))
        # ********** End **********#
    # 多线程
    threads = []
    thread_code = 5
    # 请在此添加实现代码 #
    # ********** Begin *********#
    for _ in range(thread_code):
        threads.append(BaiduSpider(queue))
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # ********** End **********#
第6关:scrapy框架简单使用
任务描述
本关任务:编写一个使用Scrapy框架爬取桂林电子科技大学计算机与信息安全学院网站就业信息的小程序。
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
class JySpider(scrapy.Spider):
    """Scrapy spider that prints job-posting titles for a given date."""
    name = 'jy'
    allowed_domains = ['guet.edu.cn']
    start_urls = ['https://www.guet.edu.cn/jy/zhaopin.jsp?a165823t=475&a165823p=1&a165823c=10&urltype=tree.TreeTempUrl&wbtreeid=1003']
    date = ''  # 此date为给定日期,已在__init__.py中初始化,直接在下面函数中用self.date调用即可
    def parse(self, response):
        # only list pages 59 and 58 hold the target date's postings
        # (the original comment claiming "pages 1 to 200" was wrong)
        for i in [59, 58]:
            url = 'https://www.guet.edu.cn/jy/zhaopin.jsp?a165823t=475&a165823c=10&urltype=tree.TreeTempUrl&wbtreeid=1003&a165823p=' + str(i)
            yield scrapy.Request(url, callback=self.parse_page)
    def parse_page(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        li_list = soup.find('div', class_='jiuye zhaopin').ol.find_all('li')
        for li in li_list:
            # use self.date (as the scaffold instructs) instead of the
            # hard-coded '2022-10-19'; also skip the weekly aggregate
            # announcement that carries the same date
            if li.find_all(text=self.date) != [] and li.a.span.text != '第八周用人单位进校招聘一览表(10月23日-10月30日)':
                print(li.a.span.text)