直接上代码:
import requests
from bs4 import BeautifulSoup
def get_content(url, timeout=10):
    """Download *url* and return the raw response body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server to respond. requests applies
            no timeout by default, so without one a stalled connection
            would block the caller indefinitely.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    # Send a browser-like User-Agent instead of the requests default.
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'
    response = requests.get(
        url,
        headers={'User-Agent': user_agent},
        timeout=timeout,
    )
    response.raise_for_status()
    # The body is handed back as bytes, so no charset detection is needed
    # here; response.encoding only influences response.text.
    return response.content
def parser_html(htmlcontent):
    """Extract (ip, port) pairs from the proxy table of a page.

    The table with id ``ip_list`` is located and, for each of its ``<tr>``
    rows carrying the class ``odd``, the text of the second and third
    ``<td>`` cells is collected as an ``(ip, port)`` tuple.

    Args:
        htmlcontent: HTML markup of the page (``str`` or ``bytes``).

    Returns:
        The module-level ``xiciinfo`` list, extended with the tuples found
        in this page. The list is shared between calls, so results
        accumulate across repeated invocations.
    """
    # Keep the historical contract of appending to the module-level
    # ``xiciinfo`` list, but create it on demand so the function no longer
    # raises NameError when the caller has not defined it beforehand.
    registry = globals().setdefault('xiciinfo', [])

    soup = BeautifulSoup(htmlcontent, 'html.parser')
    table = soup.find('table', {'id': 'ip_list'})
    # A page without the expected table contributes no entries instead of
    # failing with an opaque IndexError.
    rows = table.find_all('tr', {'class': 'odd'}) if table is not None else []

    for row in rows:
        # Look the cells up once per row rather than once per field.
        cells = row.find_all('td', limit=3)
        if len(cells) < 3:
            # Not enough columns to hold both an address and a port.
            continue
        registry.append((cells[1].get_text(), cells[2].get_text()))
    return registry
import openpyxl
def create_to_excel(wbname, data, sheetname='Sheet1'):
    """Write *data* into the first column of a new Excel workbook.

    Args:
        wbname: Path of the workbook file to create. Missing parent
            directories are created before saving.
        data: Iterable of cell values; the n-th item is written to row n
            (1-based) of column 1.
        sheetname: Title given to the workbook's active sheet.

    Returns:
        None. The workbook is saved to *wbname* and progress messages are
        printed to standard output.
    """
    # Local import: only needed to prepare the output directory.
    import os

    print('正在创建表格%s'%(wbname))
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.title = sheetname
    print("正在写入数据...")
    # Worksheet rows are 1-based, hence start=1.
    for row_number, item in enumerate(data, start=1):
        sheet.cell(row=row_number, column=1, value=item)
    # Saving fails if the target directory does not exist, so create it.
    parent = os.path.dirname(wbname)
    if parent:
        os.makedirs(parent, exist_ok=True)
    wb.save(wbname)
    print('保存工作薄%s成功...'%(wbname))
if __name__ == '__main__':
    # parser_html() stores its results in this module-level list, so it is
    # created before the page is parsed.
    xiciinfo = []
    url = 'https://www.xicidaili.com/'
    # Fetch the page and pull the (ip, port) pairs out of it.
    contents = parser_html(get_content(url))
    # Render every pair as a single "ip:port" string, one per sheet row.
    mulu = [pair[0] + ':' + pair[1] for pair in contents]
    create_to_excel('doc/ipaddr.xlsx', mulu, sheetname='西刺IP池')