我正在努力获取我的代码,从web上获取HTML表信息,以便在保存的网站列表中工作发货URL.txt文件。代码从ShipURL读取网页地址,然后转到链接并下载表数据并将其保存到csv。但我的问题是程序无法完成,因为错误“连接尝试失败,因为连接方在一段时间后没有正确响应,或建立连接失败,因为连接的主机没有响应,”在中间发生,程序停止。现在我明白了,我需要增加请求时间,使用代理或发出try语句。我浏览了一些关于同一问题的答案,但作为一个新手,我发现很难理解。任何帮助都将不胜感激。在# -*- coding: utf-8 -*-
fm = open('ShipURL.txt', 'r')
Shiplinks = fm.readlines()
import csv
from urllib import urlopen
from bs4 import BeautifulSoup
import re
for line in Shiplinks:
website = re.findall(r'(https?://\S+)', line)
website = "".join(str(x) for x in website)
if website != "":
with open('ShipData.csv','wb')as f: #Creates an empty csv file to which assign values.
writer = csv.writer(f)
shipUrl = website
shipPage = urlopen(shipUrl)
soup = BeautifulSoup(shipPage, "html.parser") #Read the web page HTML
table = soup.find_all("table", { "class" : "table1" }) #Finds table with class table1
List = []
columnRow = ""
valueRow = ""
Values = []
for mytable in table: #Loops tables with class table1
table_body = mytable.find('tbody') #Finds tbody section in table
try: #If tbody exists
rows = table_body.find_all('tr') #Finds all rows
for tr in rows: #Loops rows
cols = tr.find_all('td') #Finds the columns
i = 1 #Variable to control the lines
for td in cols: #Loops the columns
## print td.text #Displays the output
co = td.text #Saves the column to a variable
## writer.writerow([co]) Writes the variable in CSV file row
if i == 1: #Checks the control variable, if it equals to 1
if td.text[ -1] == ":":
# võtab kooloni maha ja lisab koma järele
columnRow += td.text.strip(":") + "," # Tekkis mõte, et vb oleks lihtsam kohe ühte string panna
List.append(td.text) #.. takes the column value and assigns it to a list called 'List' and..
i+=1 #..Increments i by one
else:
# võtab reavahetused maha ja lisab koma stringile
valueRow += td.text.strip("\n") + ","
Values.append(td.text) #Takes the second columns value and assigns it to a list called Values
#print List #Checking stuff
#print Values #Checking stuff
except:
print "no tbody"
# Prindime pealkirjad ja väärtused koos reavahetusega välja ka :)
print columnRow.strip(",")
print "\n"
print valueRow.strip(",")
# encode'ing hakkas jälle kiusama
# Kirjutab esimeseks reaks veeru pealkirjad ja teiseks väärtused
writer.writerow([columnRow.encode('utf-8')])
writer.writerow([valueRow.encode('utf-8')])