python第七次作业

最新推荐文章于 2022-05-25 23:56:09 发布

LJRZWW

最新推荐文章于 2022-05-25 23:56:09 发布

阅读量120

点赞数

分类专栏：基础　文章标签：homework7

本文链接：https://blog.csdn.net/qq_43077227/article/details/100043748

版权

基础专栏收录该内容

17 篇文章 0 订阅

订阅专栏

爬取清华大学的网站

import requests
import threading
import re
class university(threading.Thread):
    """Worker thread that downloads a range of pages and records parsed data.

    Each worker walks the offsets ``start_, start_ + 10, ...`` (exclusive of
    ``end_``), fetches the page for every offset, extracts titles and scores
    from the HTML and appends the result to ``res.txt``.

    Args:
        start_: First page offset handled by this worker.
        end_: Offset at which this worker stops (exclusive).
        lock: Lock shared by all workers; held while writing to ``res.txt``.
    """

    # Score markup is split into an integer part ("9.") and a fraction ("5").
    _SCORE_RE = re.compile(
        r'<p class="score"><i class="integer">(\d\.)</i>'
        r'<i class="fraction">(\d)</i></p>'
    )

    # Seconds to wait for the server before giving up on a page.
    _TIMEOUT = 10

    def __init__(self, start_, end_, lock):
        threading.Thread.__init__(self)
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
        }
        # The "%d" placeholder is filled with the page offset in run().
        # NOTE(review): whether the site serves index<offset>.html for every
        # offset is not verifiable from this file -- confirm the URL scheme.
        self.base_url = "https://www.tsinghua.edu.cn/publish/thu2018/index%d.html"
        self.start_ = start_
        self.end_ = end_
        self.lock = lock

    def run(self):
        """Fetch every page in this worker's range and persist the results."""
        for offset in range(self.start_, self.end_, 10):
            url = self.base_url % offset
            try:
                response = requests.get(
                    url, headers=self.headers, timeout=self._TIMEOUT)
            except requests.RequestException as exc:
                # One unreachable page must not abort the remaining offsets.
                print('offset {} failed: {}'.format(offset, exc))
                continue
            info_list = self.get_Information(response.text)
            # Serialize file access across workers.
            with self.lock:
                self.write(info_list)
            print('offset {} OK !'.format(offset))

    def get_Information(self, html):
        """Extract titles and scores from *html*, in document order.

        A line containing ``class="image-link"`` contributes the value of its
        ``title`` attribute; a line containing ``class="integer"`` contributes
        the score assembled from its integer and fraction parts. Lines that
        carry the marker but not the expected markup are skipped.

        Returns:
            A list of strings (titles and scores interleaved as encountered).
        """
        information_list = []
        for line in html.split('\n'):
            if 'class="image-link"' in line:
                pieces = line.split('title="')
                # Skip the line when it has no title attribute.
                if len(pieces) > 1:
                    information_list.append(pieces[1].split('"')[0])
            if 'class="integer"' in line:
                res = self._SCORE_RE.search(line)
                # The marker alone does not guarantee the full score markup.
                if res is not None:
                    information_list.append(res.group(1) + res.group(2))
        return information_list

    def write(self, info_list):
        """Append the ``str()`` form of *info_list* as one line to ``res.txt``."""
        str_ = str(info_list) + '\n'
        with open('res.txt', mode='a', encoding='utf8') as file:
            file.write(str_)
if __name__ == "__main__":
    # Launch four workers, each covering a 50-wide slice of offsets, all
    # sharing one lock, then wait for every one of them to finish.
    shared_lock = threading.Lock()
    workers = []
    for index in range(4):
        lower = index * 50
        worker = university(start_=lower, end_=lower + 50, lock=shared_lock)
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()
    print('Over')

LJRZWW

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python第七次作业

爬取清华大学的网站 import requests import threading import re class university(threading.Thread): def __init__(self, start_, end_,lock): threading.Thread.__init__(self) self.headers = { ...
复制链接

扫一扫