This program is for anyone learning web scraping. If you find it useful, please give it a like!
import re
import requests
import numpy as np
from bs4 import BeautifulSoup
def names(url):
    """Collect travelogue links from one listing page and keep only the Nanjing ones."""
    a = []
    b = []
    r = requests.get(url, timeout=30, headers=myHeader)
    r.raise_for_status()
    r.encoding = 'utf-8'
    html = r.text
    soup = BeautifulSoup(html, "html.parser")
    dt = soup.find_all('a', attrs={'class': 'journal-item cf'})
    for i in dt:  # append the full URL of every travelogue entry to list a
        a.append('https://you.ctrip.com' + i.get('href'))
    for i in a:  # keep only the URLs that contain 'nanjing' in list b
        if 'nanjing' in i:
            b.append(i)
    return b
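# A quick sanity check for names() (hypothetical usage; note that myHeader is
# defined near the bottom of the script, so run this only after that point):
# links = names("https://you.ctrip.com/travels/nanjing9/t3-p1.html")
# print(links[:3])  # up to three Nanjing travelogue URLs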
def spans(page, url, count):
    """Return the text of every <span> in the trip-info block, with HTML tags stripped."""
    try:
        a = []
        r = requests.get(url, timeout=30, headers=myHeader)
        r.raise_for_status()
        r.encoding = 'utf-8'
        html = r.text
        soup = BeautifulSoup(html, "html.parser")
        div = soup.find('div', attrs={'class': 'ctd_content_controls cf'})
        span1 = div.find_all('span')
        for s in span1:  # strip any HTML tags from each span's text
            a.append(re.sub(r"<.*?>", '', str(s)))
        return a
    except Exception as ex:
        a = ''
        print("Error on page {}, entry {}: {}".format(page, count, ex))
        return a
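# For a typical travelogue page, spans() should return short labeled strings of the
# form parsed by values() below — illustratively, something like
# ['天数:3天', '时间:4月', '人均:800元', '和谁:和朋友', '玩法:美食 摄影'].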
def values(span):
    """Extract days / month / cost per person / companions / activities from the span texts."""
    result = []
    # Defaults when a field is missing; ' 无 ' ("none") is the value that
    # money() and month() test against later.
    result1 = ' 无 '
    result2 = result4 = ' 无 '
    result3 = ' 无 '
    result5 = ' '
    for item in span:
        # Tags were already stripped in spans(); the sub here is just defensive.
        item = re.sub(r"<.*?>", '', item).replace(' ', '')
        if item[0:3] == '天数:':   # days
            result1 = item[3:]
        if item[0:3] == '时间:':   # month of the trip
            result2 = item[3:]
        if item[0:3] == '人均:':   # cost per person
            result3 = item[3:]
        if item[0:3] == '和谁:':   # travel companions
            result4 = item[3:]
        if item[0:3] == '玩法:':   # activities
            result5 = item[3:].replace('玩法:', '')
    result.append(result1)
    result.append(result2)
    result.append(result3)
    result.append(result4)
    result.append(result5)
    return result
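# Worked example for values() (illustrative input in the shape spans() produces):
# values(['天数:3天', '时间:4月', '人均:800元', '和谁:和朋友', '玩法:美食 摄影'])
# returns ['3天', '4月', '800元', '和朋友', '美食摄影']
# (the replace(' ', '') above strips the space inside the last field).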
def dls(url):
    """Return the names of the places (POIs) mentioned in the travelogue, space-separated."""
    try:
        a = ''
        r = requests.get(url, timeout=30, headers=myHeader)
        r.raise_for_status()
        r.encoding = 'utf-8'
        html = r.text
        soup = BeautifulSoup(html, "html.parser")
        dl1 = soup.find('div', attrs={'class': 'author_poi'})
        dd = dl1.find_all('dd')
        for i in dd:
            a1 = i.find('a')
            a = a + a1.string + ' '
        return a
    except Exception:
        return ' '  # no POI block on this page
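# Hypothetical usage of dls() — the URL below is a placeholder, not a real travelogue:
# pois = dls('https://you.ctrip.com/travels/nanjing9/<travelogue-id>.html')
# pois is a space-separated string of place names, or ' ' if the POI block is missing.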
def money(all_value):
    """Average per-person cost over all travelogues that report one."""
    total = 0
    count = 0
    for i in all_value:
        if i[2] != ' 无 ':  # i[2] is the per-person cost, e.g. '800元'
            total += int(str(i[2]).replace('元', ''))
            count += 1
    return total / count if count else 0  # avoid division by zero when no entry reports a cost
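# Worked example for money() (illustrative rows in the format values() returns):
# money([['3天', '4月', '800元', '和朋友', ''], ['2天', '5月', ' 无 ', '和父母', '']])
# returns 800.0 — the row whose cost is ' 无 ' ("none") is skipped.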
def month(all_value):
    """Return the month(s) in which the most trips were taken, e.g. ['4月']."""
    a = [0] * 12
    s = []
    for i in all_value:
        if i[1] != ' 无 ':  # i[1] is the trip month, e.g. '4月'
            a[int(str(i[1]).replace('月', '')) - 1] += 1
    for i in range(len(a)):
        if a[i] == np.max(a):
            s.append(str(i + 1) + '月')
    return s
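# Worked example for month() — only index 1 (the trip month) is read here:
# month([['', '4月'], ['', '4月'], ['', '5月']])
# returns ['4月'] since April occurs most often (2 of 3 trips).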
all_value = []
myHeader = {
    # If the site starts rejecting requests, log in with a browser and add your own
    # session cookie here, e.g. 'cookie': '<your cookie string>',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}
for i in range(1, 5):  # listing pages 1-4 of Nanjing travelogues
    with open("南京游记.txt", 'a+', encoding='utf-8') as file:
        file.write('第{}页'.format(i))  # page marker ("Page N") in the output file
        file.write("\n")
    url = "https://you.ctrip.com/travels/nanjing9/t3-p{}.html".format(i)
    name = names(url)
    count = 0
    for j in name:
        count += 1
        span = spans(i, j, count)
        if span != '':
            value = values(span)
            dl = dls(j)
            value.append(dl)
            with open("南京游记.txt", 'a+', encoding='utf-8') as file:
                for k in value:
                    file.write(k + ' ')
                file.write('\n')
            all_value.append(value)
    print(f'Page {i} saved successfully')
print('Average cost:', money(all_value))
print('Recommended month(s):', month(all_value))
Personal blog: https://beiliu266.github.io