# Source page: http://www.chinapost.com.cn/html1/folder/181312/9531-1.htm?prov=吉林
import requests
from lxml import etree
import pandas as pd
# Paged search endpoint. The query string is left open after `pos=` so the
# caller can append the row offset; `prov` is the percent-encoded form of 吉林.
url = (
    'http://iframe.chinapost.com.cn/jsp/type/institutionalsite/SiteSearchJT.jsp'
    '?community=ChinaPostJT'
    '&prov=%E5%90%89%E6%9E%97'
    '&pos='
)

# Browser-like request headers passed along with each page request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/56.0.2924.90 Safari/537.36 '
        '2345Explorer/9.3.2.17331'
    ),
    'Referer': 'https://baike.baidu.com',
    'Connection': 'keep-alive',
}
def cookie():
    """Load request cookies from ``cookie.txt`` in the working directory.

    The file is read as a single cookie string: ``name=value`` pairs
    separated by ``;``.  Whitespace around each pair is stripped, and only
    the first ``=`` separates the name from the value, so a value may itself
    contain ``=``.

    Returns:
        dict: cookie names mapped to their values.  If a name occurs more
        than once, the last occurrence wins.

    Raises:
        OSError: if ``cookie.txt`` cannot be opened.
        ValueError: if a non-empty segment contains no ``=``.
    """
    with open('cookie.txt', 'r') as f:
        raw = f.read()

    cookies = {}
    for pair in raw.split(';'):
        pair = pair.strip()
        if not pair:
            # A trailing ';', a trailing newline or an empty file leaves an
            # empty segment; skip it rather than fail on the unpacking below.
            continue
        name, value = pair.split('=', 1)
        cookies[name] = value
    return cookies
# Fetch 100 consecutive result pages, pull the text of the first seven table
# columns from each, and write everything to Result.xlsx.

# Read cookie.txt once up front; the cookies do not change between requests.
request_cookies = cookie()

page_frames = []
for i in range(100):
    # The offset appended to `pos=` advances in steps of 10
    # (presumably 10 rows per page -- confirm against the site).
    r = requests.get(
        url + str(i * 10),
        headers=headers,
        cookies=request_cookies,
        timeout=30,  # do not hang forever on a stalled connection
    ).text
    comments = etree.HTML(r)
    # td[n]/text() gathers the text nodes of the n-th cell across all rows,
    # giving one list per column.
    columns = [
        comments.xpath('/html/body/table/tr/td[%d]/text()' % n)
        for n in range(1, 8)
    ]
    # Rows of the intermediate frame are columns of the page, hence the
    # transpose; shorter columns are padded with missing values by pandas.
    page_frames.append(pd.DataFrame(columns).T)

# DataFrame.append was removed in pandas 2.0; a single concat gives the same
# stacked frame (per-page row labels are kept, as before).
results = pd.concat(page_frames)
# The `encoding` argument of to_excel was removed in pandas 2.0.
results.to_excel('Result.xlsx')
# More crawler examples: https://blog.csdn.net/weixin_39777626/article/details/81564819