Python开发技术—网络爬虫

最新推荐文章于 2023-05-02 00:00:12 发布

Ashorer.

最新推荐文章于 2023-05-02 00:00:12 发布

阅读量1.1k

点赞数 7

分类专栏：头歌作业 文章标签：python 爬虫 开发语言

本文链接：https://blog.csdn.net/qq_64010161/article/details/130034763

版权

头歌作业专栏收录该内容

8 篇文章 15 订阅

订阅专栏

第1关：urllib

任务描述

本关任务：使用python内置库urllib发起请求并返回状态码。

import urllib.request
import sys
def Evidence(url):
    """Request *url* with urllib and print the outcome.

    On success prints ``Status: <code> <reason>``. On failure prints the
    text of the exception that was actually raised, so an HTTP error, a
    malformed URL and a DNS failure are each reported as what they are.

    Args:
        url: Address to request.
    """
    # ********** Begin *********#
    # A browser-like User-Agent is sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'
    }
    try:
        request = urllib.request.Request(url=url, headers=headers)
        # The context manager closes the response even if printing fails.
        with urllib.request.urlopen(request) as f:
            print('Status:', f.status, f.reason)
    except Exception as e:
        # Outermost boundary of the exercise: report the failure instead of
        # raising, but show the real error rather than a canned message.
        print(e)
    # ********** End **********#

第2关：requests

任务描述

本关任务：使用python第三方库requests发起请求并返回状态码。

import requests

def Evidence(url):
    """Request *url* with requests and print the outcome.

    Prints ``Status: <code>`` when a response is received, or the fixed
    failure message when the request raises a requests exception.

    Args:
        url: Address to request.
    """
    # ********** Begin *********#
    try:
        # Bound the wait so an unresponsive host cannot hang the caller.
        r = requests.get(url, timeout=10)
    except requests.RequestException:
        print("url请求失败")
    else:
        print('Status:', r.status_code)
    # ********** End **********#

第3关：re

任务描述

本关任务：编写一个能匹配Email地址的正则小程序。

import re

def Evidence(text):
    """Print the result of matching an e-mail-like pattern against *text*.

    The pattern is applied at the start of the string only. It accepts one
    or more letters, digits or dots, an ``@``, then the shortest run of
    letters, digits or dots that is followed by the literal ``com``.
    Prints the match object on success and ``None`` otherwise.

    Args:
        text: String to test.
    """
    # ********** Begin *********#
    email_pattern = re.compile(r'[0-9a-zA-Z.]+@[0-9a-zA-Z.]+?com')
    result = email_pattern.match(text)
    print(result)
    # ********** End **********#

第4关：BeautifulSoup

任务描述

本关任务：编写一个能爬取桂电就业信息的小程序。

import requests
from bs4 import BeautifulSoup
import urllib
from lxml import etree

def create_request(page):
    """Build the urllib request for one page of the job-posting list.

    Args:
        page: Page number, sent as the ``a165823p`` query parameter.

    Returns:
        A ``urllib.request.Request`` for that page carrying a browser-like
        User-Agent header.
    """
    # The snippet only does ``import urllib``, which does not by itself load
    # the submodules used below; import them explicitly.
    import urllib.parse
    import urllib.request

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'
    }
    # Encode the whole query in one place; the parameter order matches the
    # URL that was previously assembled from two string halves.
    query = urllib.parse.urlencode({
        'a165823t': 475,
        'a165823p': page,
        'a165823c': 10,
        'urltype': 'tree.TreeTempUrl',
        'wbtreeid': 1003,
    })
    url = 'https://www.guet.edu.cn/jy/zhaopin.jsp?' + query
    return urllib.request.Request(url=url, headers=headers)

def get_content(request):
    """Fetch *request* and return the response body decoded as UTF-8.

    Errors raised while opening the URL or decoding the body are not
    handled here and propagate to the caller.

    Args:
        request: A URL string or ``urllib.request.Request`` to open.

    Returns:
        The response body as ``str``.
    """
    # The snippet only does ``import urllib``; load the submodule explicitly.
    import urllib.request

    # The context manager closes the response once the body has been read.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')

def Evidence(date):
    """Print the job-posting titles listed for one of two known dates.

    For each supported date the listing pages and the 1-based ``<li>``
    positions holding that date's postings are fixed in the table below.
    Every listed page is downloaded and the title text at each position is
    printed, in table order. Any other *date* prints nothing.

    Args:
        date: Date string; ``'2022-10-19'`` and ``'2022-10-20'`` are handled.
    """
    # ********** Begin *********#
    # (date, ((page number, <li> positions on that page), ...))
    schedule = (
        ('2022-10-19', ((57, (6, 7, 8, 9, 10)), (58, (1, 2, 3, 4, 5, 6)))),
        ('2022-10-20', ((56, (8, 9, 10)), (57, (1, 2, 3, 4, 5)))),
    )
    title_xpath = '//div[@class ="jiuye zhaopin"]/ol/li[{}]/a[1]/span[1]/text()'

    for wanted_date, pages in schedule:
        if date == wanted_date:
            for page_no, positions in pages:
                html = get_content(create_request(page_no))
                doc = etree.HTML(html)
                for pos in positions:
                    titles = doc.xpath(title_xpath.format(pos))
                    print(titles[0])
    # ********** End **********#

第5关：requests+BeautifulSoup桂电毕业生就业网搜索结果提取

任务描述

本关任务：编写一个能爬取桂电毕业生就业网搜索结果的小程序。

#!/usr/bin/env python
# -*- coding: utf-8 -*-
 
import requests
from queue import Queue
import threading
from bs4 import BeautifulSoup as bs
import re
import base64
 
# HTTP request headers passed to every requests.get() call made by the spider
# below. The User-Agent value identifies the client as Firefox 22 on Ubuntu.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, compress',
    'Accept-Language': 'en-us;q=0.5,en;q=0.3',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
    }
 
class BaiduSpider(threading.Thread):
    """Worker thread that drains a shared queue of search-result URLs.

    Every URL taken from the queue is handed to :meth:`spider`. Several
    workers may share one queue.
    """

    def __init__(self, queue):
        """Store the shared work queue.

        Args:
            queue: Queue of URL strings to process.
        """
        super().__init__()
        self._queue = queue

    def run(self):
        """Process URLs until the queue has no more items."""
        from queue import Empty

        while True:
            # Take the next item without blocking. Checking ``empty()`` and
            # then calling a blocking ``get()`` lets another worker grab the
            # last item in between, leaving this thread waiting forever.
            try:
                url = self._queue.get_nowait()
            except Empty:
                break
            try:
                self.spider(url)
            except Exception as e:
                # A failure on one page must not stop the worker; report it
                # and move on to the next URL.
                print(e)

    def spider(self, url):
        """Print the URL and title of each result dated 2022年10月21日.

        Downloads the search-result page at *url*, looks at every link whose
        ``href`` starts with ``info/`` and, when the text of the link's first
        ``<font>`` element equals the target date, downloads the linked page
        and prints its address followed by the text of its ``div.title``.

        Args:
            url: Address of one search-result page.
        """
        # ********** Begin *********#
        res = requests.get(url, headers=headers)
        soup = bs(res.content, 'lxml')
        links = soup.find_all(name='a', attrs={'href': re.compile(r'^info/')})
        for link in links:
            if link.select('font')[0].text == '2022年10月21日':
                # Result links are relative to the site's /jy/ directory.
                url1 = "https://www.guet.edu.cn/jy/" + link['href']
                res1 = requests.get(url1, headers=headers)
                print(url1)
                print(bs(res1.content, 'lxml').select('div[class="title"]')[0].text)
        # ********** End **********#
 
 
 
def Evidence(keyword):
    """Search the careers site for *keyword* using a pool of spider threads.

    Queues the search-result URLs for pages 1 to 99, starts five
    ``BaiduSpider`` workers on that queue and waits for all of them to
    finish.

    Args:
        keyword: Search term; it is sent Base64-encoded (from its UTF-8
            bytes) in the ``newskeycode2`` query parameter.
    """
    queue = Queue()

    # ********** Begin *********#
    key = base64.b64encode(keyword.encode('utf-8')).decode('utf-8')
    # ********** End **********#

    # The number of result pages to crawl can be adjusted here.
    for i in range(1, 100):
        # ********** Begin *********#
        queue.put("https://www.guet.edu.cn/jy/search.jsp?wbtreeid=1001&searchScope=0&currentnum={id}&newskeycode2={key}".format(id=i, key=key))
        # ********** End **********#

    # Worker pool. This setup belongs outside the URL loop: it must run
    # exactly once, and must exist even if the page range is empty.
    thread_code = 5
    # ********** Begin *********#
    threads = [BaiduSpider(queue) for _ in range(thread_code)]

    for t in threads:
        t.start()

    # Block until every worker has drained its share of the queue.
    for t in threads:
        t.join()
    # ********** End **********#

第6关：scrapy框架简单使用

任务描述

本关任务：编写一个使用Scrapy框架爬取桂林电子科技大学计算机与信息安全学院网站就业信息的小程序。

# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup


class JySpider(scrapy.Spider):
    """Scrapy spider that prints job-posting titles dated 2022-10-19.

    Listing pages 59 and 58 are requested; on each page every ``<li>``
    containing the text ``2022-10-19`` has its title printed, except for one
    weekly overview entry that is skipped.
    """

    name = 'jy'
    allowed_domains = ['guet.edu.cn']
    start_urls = ['https://www.guet.edu.cn/jy/zhaopin.jsp?a165823t=475&a165823p=1&a165823c=10&urltype=tree.TreeTempUrl&wbtreeid=1003']
    # Target date. Per the exercise template it is initialised in __init__.py
    # and can be read as self.date.
    # NOTE(review): parse_page compares against a hard-coded date rather than
    # self.date -- confirm whether that is intended.
    date = ''

    def parse(self, response):
        """Yield one request per listing page (59, then 58) for parse_page."""
        page_url = 'https://www.guet.edu.cn/jy/zhaopin.jsp?a165823t=475&a165823c=10&urltype=tree.TreeTempUrl&wbtreeid=1003&a165823p='
        for page_no in (59, 58):
            yield scrapy.Request(page_url + str(page_no), callback=self.parse_page)

    def parse_page(self, response):
        """Print the titles of the matching postings on one listing page."""
        soup = BeautifulSoup(response.text, 'lxml')
        listing = soup.find('div', class_='jiuye zhaopin')
        for item in listing.ol.find_all('li'):
            # Skip entries that do not carry the target date.
            if not item.find_all(text='2022-10-19'):
                continue
            title = item.a.span.text
            # This weekly overview entry is excluded from the output.
            if title == '第八周用人单位进校招聘一览表（10月23日-10月30日）':
                continue
            print(title)

Ashorer.

关注

7
点赞
踩
10

收藏

觉得还不错? 一键收藏
0
评论
Python开发技术—网络爬虫

本关任务：编写一个使用Scrapy框架爬取桂林电子科技大学计算机与信息安全学院网站就业信息的小程序。本关任务：使用python第三方库requests发起请求并返回状态码。本关任务：使用python内置库urllib发起请求并返回状态码。本关任务：编写一个能爬取桂电毕业生就业网搜索结果的小程序。本关任务：编写一个能匹配Email地址的正则小程序。本关任务：编写一个能爬取桂电就业信息的小程序。
复制链接

扫一扫

专栏目录