Python经典基础习题(网络爬虫)

1.批量爬取院士信息,把每位院士的文字介绍保存到以该院士名字命名的记事本文件中,照片保存到以该院士名字命名的jpg文件中。

程序代码:

from urllib.request import urlopen
from re import findall
import requests
# Scrape the Chinese Academy of Engineering roster: for every academician
# linked from the index page, save the introduction text to "<name>.txt" and
# the portrait to "<name>.jpg" in the current working directory.

INDEX_URL = 'https://www.cae.cn/cae/html/main/col48/column_48_1.html'
SITE_ROOT = 'https://www.cae.cn'

# The patterns below are tied to the site's markup; keep them in sync with it.
PATTERN_LINK = 'href="(.+?)"'
PATTERN_JPG = 'img src="(.+?)" style="width:150px;height:210px;"'
PATTERN_MESSAGE = '<p>&ensp;&ensp;&ensp;&ensp;(.+?)</p><p>&nbsp;</p><p>&ensp;&ensp;&ensp;&ensp;(.+?)</p>'
PATTERN_NAME = '<div class="right_md_name">(.+?)</div>'


def fetch_page(page_url):
    """Download *page_url* and return its body decoded as UTF-8."""
    with urlopen(page_url) as response:
        return response.read().decode()


def extract_profile_urls(index_html):
    """Return the absolute URLs of all profile pages linked from *index_html*.

    A link counts as a profile page when it lives under
    ``/cae/html/main/colys`` and ends in ``.html``.
    """
    return [
        SITE_ROOT + link
        for link in findall(PATTERN_LINK, index_html)
        if link.startswith('/cae/html/main/colys') and link.endswith('.html')
    ]


def parse_profile(profile_html):
    """Return ``(name, intro, photo_src)`` extracted from a profile page.

    Each element is ``None`` when the page does not contain it.  When a
    pattern matches several times the last match wins.  ``intro`` is the two
    captured introduction paragraphs concatenated.
    """
    names = findall(PATTERN_NAME, profile_html)
    intros = findall(PATTERN_MESSAGE, profile_html)
    photos = findall(PATTERN_JPG, profile_html)
    name = names[-1] if names else None
    intro = "".join(intros[-1]) if intros else None
    photo_src = photos[-1] if photos else None
    return name, intro, photo_src


def build_photo_url(photo_src):
    """Resolve the ``src`` attribute of a portrait against the site root."""
    # urljoin copes with root-relative, relative and absolute src values,
    # where plain concatenation would yield "//" or a broken URL.
    from urllib.parse import urljoin
    return urljoin(SITE_ROOT + '/', photo_src)


def save_profile(profile_url):
    """Fetch one profile page and write its text and photo to disk.

    Returns ``True`` when the page was recognised (a name was found),
    ``False`` when it was skipped.
    """
    name, intro, photo_src = parse_profile(fetch_page(profile_url))
    if name is None:
        # Without a name there is no file name to write to; never fall back
        # to data left over from the previous profile.
        print("skipped (no name found):", profile_url)
        return False

    if intro is not None:
        with open(name + '.txt', 'w', encoding="utf-8") as text_file:
            text_file.write(intro)

    if photo_src is not None:
        # Download first and create the file only on success, so a failed
        # request does not leave an empty or bogus .jpg behind.
        response = requests.get(build_photo_url(photo_src), timeout=30)
        if response.ok:
            with open(name + '.jpg', 'wb') as image_file:
                image_file.write(response.content)
        else:
            print("photo download failed:", response.status_code, profile_url)
    return True


def download_academicians(index_url=INDEX_URL):
    """Save text and photo for every academician linked from *index_url*."""
    for profile_url in extract_profile_urls(fetch_page(index_url)):
        save_profile(profile_url)


if __name__ == "__main__":
    download_academicians()


2.根据院士名单,爬取该院士性别,族别信息;根据院士简介提取该院士就读本科学校,入选院士年份;将院士姓名,性别,族别信息,本科学校,入选院士年份信息写入excel文件。

程序代码:

from urllib.request import urlopen
from re import findall
import  openpyxl
from openpyxl import Workbook
# Scrape the Chinese Academy of Engineering roster and write one spreadsheet
# row per academician: name, gender, ethnicity, university, election year.

XLSX_PATH = r'D:\message.xlsx'
ROSTER_URL = 'https://www.cae.cn/cae/html/main/col48/column_48_1.html'
ROSTER_SITE = 'https://www.cae.cn'
MAX_ACADEMICIANS = 39  # data rows 2..40, the same cap the script always had
SHEET_TITLE = "工程院士信息"
SHEET_HEADERS = ('姓名', '性别', '民族', '毕业院校', '入选年份')

# The patterns below are tied to the site's markup; keep them in sync with it.
PATTERN_HREF = 'href="(.+?)"'
PATTERN_INTRO = '<p>&ensp;&ensp;&ensp;&ensp;(.+?)</p><p>&nbsp;</p><p>&ensp;&ensp;&ensp;&ensp;(.+?)</p>'
PATTERN_INTRO_LONG = PATTERN_INTRO + '(<p>&nbsp;</p><p>&ensp;&ensp;&ensp;&ensp;(.+?)</p>)*'
PATTERN_DETAIL_LINK = '<a href="(.+?)" target="_blank">'
# NOTE(review): the separator is an ASCII ':' here; confirm the live page does
# not use a full-width colon.
PATTERN_FIELD = '(<span>:</span><h4>|<span>:</span>)(.+?)(</h4></div>|</div>)'
PATTERN_SCHOOL = '毕业于(.+?)大学'
PATTERN_ELECTED = r'(\d{4})年当选'


def read_page(page_url):
    """Download *page_url* and return its body decoded as UTF-8."""
    with urlopen(page_url) as response:
        return response.read().decode()


def list_profile_urls(roster_html):
    """Return the absolute URLs of all profile pages linked from *roster_html*."""
    return [
        ROSTER_SITE + link
        for link in findall(PATTERN_HREF, roster_html)
        if link.startswith('/cae/html/main/colys') and link.endswith('.html')
    ]


def parse_identity(detail_html):
    """Return ``(name, gender, ethnicity)`` from a detail page, or ``None``.

    The page is expected to list at least three labelled fields; the first is
    written as the name, the second as ethnicity and the third as gender
    (presumably the order used by the site -- verify against a live page).
    """
    fields = [match[1] for match in findall(PATTERN_FIELD, detail_html)]
    if len(fields) < 3:
        return None
    return fields[0], fields[2], fields[1]


def extract_school(profile_html):
    """Return the first plausible "…大学" after "毕业于" in the intro, or ``None``.

    Candidates shorter than 4 or longer than 9 characters are rejected; they
    are usually a run of text that merely happens to end in "大学".
    """
    for paragraphs in findall(PATTERN_INTRO, profile_html):
        for paragraph in paragraphs:
            for stem in findall(PATTERN_SCHOOL, paragraph):
                school = stem + "大学"
                if 4 <= len(school) <= 9:
                    return school
    return None


def extract_election_year(profile_html):
    """Return the 4-digit year preceding "年当选" in the intro, or ``None``."""
    intro_matches = findall(PATTERN_INTRO_LONG, profile_html)
    if not intro_matches:
        return None
    # Only the first intro block is inspected.  Its nested groups repeat the
    # last paragraph, so stop at the first hit instead of de-duplicating.
    for part in intro_matches[0]:
        years = findall(PATTERN_ELECTED, part)
        if years:
            return years[0]
    return None


def export_academicians(index_url=ROSTER_URL, output_path=XLSX_PATH,
                        max_rows=MAX_ACADEMICIANS):
    """Scrape up to *max_rows* academicians and save them to *output_path*.

    Every academician occupies exactly one row, so a missing school or year
    leaves a blank cell instead of shifting the following rows.  The workbook
    is saved once, in a ``finally`` clause, so rows collected before a network
    error are still written.
    """
    book = Workbook()
    # The default sheet created by Workbook() is left in place, as before.
    sheet = book.create_sheet(title=SHEET_TITLE)
    for column, header in enumerate(SHEET_HEADERS, start=1):
        sheet.cell(row=1, column=column, value=header)

    row = 2
    try:
        for profile_url in list_profile_urls(read_page(index_url)):
            if row - 2 >= max_rows:
                break
            profile_html = read_page(profile_url)
            detail_links = findall(PATTERN_DETAIL_LINK, profile_html)
            identity = None
            if detail_links:
                identity = parse_identity(read_page(detail_links[0]))
            if identity is None:
                print("skipped (no identity data):", profile_url)
                continue
            record = identity + (
                extract_school(profile_html),
                extract_election_year(profile_html),
            )
            for column, value in enumerate(record, start=1):
                # A value of None leaves the cell empty.
                sheet.cell(row=row, column=column, value=value)
            row += 1
    finally:
        book.save(output_path)


if __name__ == "__main__":
    export_academicians()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值