【一天一个Python小案例】网页表格抓取

import requests, json, os, csv
from bs4 import BeautifulSoup
import numpy as np


def new_prop(prop_info):
    """Print a formatted report for a newly discovered proposal row.

    Parameters
    ----------
    prop_info : list of str
        One table row; indices used: 0 (num), 1 (source name),
        4 (proposal ID), 5 (target ID), 7 (number of obs.),
        8 (obs. time in ks).
    """
    print(" new proposal ".center(20, "-"))
    # f-strings instead of dated %-formatting; output is unchanged.
    print(f" - num: {prop_info[0]}\n"
          f" - Source Name: {prop_info[1]}\n"
          f" - Proposal ID: {prop_info[4]}\n"
          f" - Target ID: {prop_info[5]}\n"
          f" - Number of Obs.: {prop_info[7]}\n"
          f" - Obs. Time(ks): {prop_info[8]}")


def new_obs(prop_info, old_prop):
    """Print a report for a known proposal whose observation count changed.

    Parameters
    ----------
    prop_info : list of str
        The freshly scraped table row (same layout as in ``new_prop``).
    old_prop : list of str
        The previously cached row for the same proposal/target; indices
        7 and 8 supply the old observation count and time.
    """
    print(" new observation ".center(20, "-"))
    # f-strings instead of dated %-formatting; output is unchanged.
    print(f" - num: {prop_info[0]}\n"
          f" - Source Name: {prop_info[1]}\n"
          f" - Proposal ID: {prop_info[4]}\n"
          f" - Target ID: {prop_info[5]}\n"
          f" - Number of Obs.: {old_prop[7]} -> {prop_info[7]}\n"
          f" - Obs. Time(ks): {old_prop[8]} -> {prop_info[8]}")


if __name__ == '__main__':
    # Load the locally cached observation table (if any) so the live site
    # can be diffed against it and only new entries get reported.
    if os.path.isfile("hxmt_obs.json"):
        print("Searching new observations...")
        with open("hxmt_obs.json", "r") as infile:
            obs_list = json.load(infile)
    else:
        print("Saving HXMT observation table to hxmt_obs.json...")
        obs_list = {}
    hxmt = "http://www.hxmt.org"
    url0 = "http://www.hxmt.org/ObsSrcList.jhtml"
    rsp = requests.get(url0)
    soup = BeautifulSoup(rsp.text, 'html.parser')
    # One link per announcement-of-opportunity (AO) observation table.
    lists = soup.select("div.hy_hxmt_title > a")
    n_new_table = 0
    n_new_prop_all = 0
    summary = ""
    for ll in lists:
        href = ll.get("href")
        if href in obs_list:
            print(ll.get_text())
        else:
            print("Adding new table %s..." % ll.get_text())
            obs_list[href] = []
        # Map (proposal ID, target ID) -> cached row index.  This keeps the
        # duplicate check O(1) and — unlike the old np.append(prop_list, ...)
        # calls, whose return values were discarded because numpy.append
        # never mutates in place — it stays up to date as rows are appended
        # during this loop.
        index_of = {(row[4], row[5]): i for i, row in enumerate(obs_list[href])}
        url_list = hxmt + href
        table_name = ll.get_text()
        # e.g. "AO01" — four characters starting at "AO".
        table_name = table_name[table_name.index("AO"):table_name.index("AO") + 4]
        rsp = requests.get(url_list)
        soup = BeautifulSoup(rsp.text, 'html.parser')
        obs_table = soup.select("table")[0]
        # NOTE(review): rows are identified by an exact inline style string;
        # this will silently match nothing if the site's markup changes.
        obs_rows = obs_table.find_all(style=";height:19px")
        n_new_prop = 0
        for row in obs_rows:
            row_info = [child.get_text().replace("\u00a0", "")
                        for child in row.children]
            if len(row_info) == 8:
                # Row is missing the Proposal ID column: shift the trailing
                # columns right and mark the proposal as unknown.
                row_info.insert(4, 'None')
            if row_info[-1] == '':
                row_info[-1] = 'None'
            key = (row_info[4], row_info[5])
            if key not in index_of:
                new_prop(row_info)
                n_new_prop += 1
                index_of[key] = len(obs_list[href])
                obs_list[href].append(row_info)
            else:
                old_prop = obs_list[href][index_of[key]]
                if old_prop[7] != row_info[7]:
                    # Same proposal/target but the observation count changed:
                    # report it and update the cached row in place.
                    new_obs(row_info, old_prop)
                    n_new_prop += 1
                    old_prop[7] = row_info[7]
                    old_prop[8] = row_info[8]
        # Plain-text dump: space-separated columns, one row per line.
        with open(table_name + ".txt", "w") as outfile:
            print("Saving table %s to %s..." % (href, table_name + ".txt"))
            for row in obs_list[href]:
                outfile.write(" ".join(row) + " \n")
        # newline='' is required on Python 3 so csv.writer does not emit
        # doubled line endings on Windows.
        with open(table_name + ".csv", "w", newline='') as outfile:
            print("Saving table %s to %s..." % (href, table_name + ".csv"))
            csv.writer(outfile).writerows(obs_list[href])
        if n_new_prop != 0:
            summary += " - %d new proposal(s) in %s.\n" % (n_new_prop, table_name)
            n_new_table += 1
            n_new_prop_all += n_new_prop
    # Persist the merged table for the next run's diff.
    with open('hxmt_obs.json', 'w') as outfile:
        json.dump(obs_list, outfile, indent=4)
    print(" summary ".center(40, "-"))
    print("%d Table(s) and %d proposal(s) updated " % (n_new_table, n_new_prop_all))
    print(summary)