【一天一个Python小案例】网页表格抓取
import requests, json, os, csv
from bs4 import BeautifulSoup
import numpy as np
def new_prop(prop_info):
    """Print a banner and a six-line summary of a newly seen proposal row.

    Args:
        prop_info: Indexable row record. Elements 0, 1, 4, 5, 7 and 8 are
            printed as num, Source Name, Proposal ID, Target ID,
            Number of Obs. and Obs. Time(ks) respectively.
    """
    template = (
        " - num: %s\n"
        " - Source Name: %s\n"
        " - Proposal ID: %s\n"
        " - Target ID: %s\n"
        " - Number of Obs.: %s\n"
        " - Obs. Time(ks): %s"
    )
    print(" new proposal ".center(20, "-"))
    # Pull out only the columns that are reported, in display order.
    shown = tuple(prop_info[column] for column in (0, 1, 4, 5, 7, 8))
    print(template % shown)
def new_obs(prop_info, old_prop):
    """Print a banner and a before/after summary for an updated proposal row.

    Args:
        prop_info: Indexable row record with the current values. Elements
            0, 1, 4 and 5 are printed as num, Source Name, Proposal ID and
            Target ID; elements 7 and 8 are the new Number of Obs. and
            Obs. Time(ks).
        old_prop: Indexable row record with the previous values. Only
            elements 7 and 8 are read, as the old side of each "old -> new"
            pair.
    """
    template = (
        " - num: %s\n"
        " - Source Name: %s\n"
        " - Proposal ID: %s\n"
        " - Target ID: %s\n"
        " - Number of Obs.: %s -> %s\n"
        " - Obs. Time(ks): %s -> %s"
    )
    print(" new observation ".center(20, "-"))
    identity = [prop_info[column] for column in (0, 1, 4, 5)]
    # Each changed quantity is reported as an (old, new) pair.
    changes = [old_prop[7], prop_info[7], old_prop[8], prop_info[8]]
    print(template % tuple(identity + changes))
# Seconds to wait on each HTTP request before giving up; without a timeout
# requests.get can block indefinitely on an unresponsive server.
_REQUEST_TIMEOUT = 30


def _parse_row(row):
    """Return the text of every child of a table row as a list of strings.

    Non-breaking spaces are stripped from each cell. A row with exactly
    8 cells gets the string 'None' inserted at index 4 (the slot that
    new_prop prints as "Proposal ID"), which makes it 9 cells long. An empty
    last cell is replaced by 'None'.

    Args:
        row: A BeautifulSoup element whose children are the row's cells.

    Returns:
        list[str]: The normalized cell texts.

    Raises:
        IndexError: If the row has no children.
    """
    cells = [child.get_text().replace("\u00a0", "") for child in row.children]
    if len(cells) == 8:
        cells.insert(4, 'None')
    if cells[-1] == '':
        cells[-1] = 'None'
    return cells


def _save_table(href, table_name, rows):
    """Write rows to <table_name>.txt and <table_name>.csv.

    Args:
        href: Link of the table; only used in the progress messages.
        table_name: Base name (without extension) of both output files.
        rows: List of rows, each a list of strings.
    """
    with open(table_name + ".txt", "w") as outfile:
        print("Saving table %s to %s..." % (href, table_name + ".txt"))
        for row in rows:
            # Every column is followed by one space, including the last one.
            outfile.write("".join(col + " " for col in row) + "\n")
    with open(table_name + ".csv", "w", newline='') as outfile:
        print("Saving table %s to %s..." % (href, table_name + ".csv"))
        csv.writer(outfile).writerows(rows)


def main():
    """Scrape the HXMT observation tables and report what changed.

    Previously saved rows are loaded from hxmt_obs.json when that file
    exists. The index page is fetched, every link matched by
    "div.hy_hxmt_title > a" is followed, and the rows of the first table on
    each linked page are merged into the saved state. Each table is then
    written to a .txt and a .csv file, hxmt_obs.json is rewritten, and a
    summary is printed.
    """
    if os.path.isfile("hxmt_obs.json"):
        print("Searching new observations...")
        with open("hxmt_obs.json", "r") as infile:
            obs_list = json.load(infile)
    else:
        print("Saving HXMT observation table to hxmt_obs.json...")
        obs_list = {}

    hxmt = "http://www.hxmt.org"
    url0 = "http://www.hxmt.org/ObsSrcList.jhtml"
    rsp = requests.get(url0, timeout=_REQUEST_TIMEOUT)
    soup = BeautifulSoup(rsp.text, 'html.parser')
    links = soup.select("div.hy_hxmt_title > a")

    n_new_table = 0
    n_new_prop_all = 0
    summery = ""
    for link in links:
        href = link.get("href")
        link_text = link.get_text()
        if href in obs_list:
            print(link_text)
        else:
            print("Adding new table %s..." % link_text)
            obs_list[href] = []
        stored_rows = obs_list[href]

        # Map (Proposal ID, Target ID) -> position in stored_rows. It is kept
        # up to date as rows are appended, so a pair that shows up twice in
        # one run is stored once and then compared like any known row.
        # setdefault keeps the first position if the saved data already
        # holds duplicates.
        index_by_key = {}
        for position, stored in enumerate(stored_rows):
            index_by_key.setdefault((stored[4], stored[5]), position)

        url_list = hxmt + href
        # The table name is the 4 characters starting at "AO" in the link
        # text; str.index raises ValueError if "AO" is absent.
        ao_start = link_text.index("AO")
        table_name = link_text[ao_start:ao_start + 4]

        rsp = requests.get(url_list, timeout=_REQUEST_TIMEOUT)
        soup = BeautifulSoup(rsp.text, 'html.parser')
        obs_table = soup.select("table")[0]
        obs_rows = obs_table.find_all(style=";height:19px")

        n_new_prop = 0
        for row in obs_rows:
            row_info = _parse_row(row)
            key = (row_info[4], row_info[5])
            position = index_by_key.get(key)
            if position is None:
                # Unknown (Proposal ID, Target ID) pair: store it as new.
                new_prop(row_info)
                n_new_prop += 1
                index_by_key[key] = len(stored_rows)
                stored_rows.append(row_info)
                continue
            old_prop = stored_rows[position]
            # A known pair counts as updated only when column 7
            # ("Number of Obs.") differs; column 8 is refreshed alongside it.
            if old_prop[7] != row_info[7]:
                new_obs(row_info, old_prop)
                n_new_prop += 1
                old_prop[7] = row_info[7]
                old_prop[8] = row_info[8]

        _save_table(href, table_name, stored_rows)

        if n_new_prop != 0:
            summery += " - %d new proposal(s) in %s.\n" % (n_new_prop, table_name)
            n_new_table += 1
            n_new_prop_all += n_new_prop

    with open('hxmt_obs.json', 'w') as outfile:
        json.dump(obs_list, outfile, indent=4)
    print(" summery ".center(40, "-"))
    print("%d Table(s) and %d proposal(s) updated " % (n_new_table, n_new_prop_all))
    print(summery)


if __name__ == '__main__':
    main()