# Simple scraper for phone numbers for sale, using requests and BeautifulSoup.
import time
from urllib.parse import urljoin

import lxml
import openpyxl
import pandas as pd
import requests
from bs4 import BeautifulSoup
page_link=[] # every detail-page link is collected here; the detail scraper later iterates this list
def get_page_link(page_number):
    """Collect detail-page hrefs from listing pages 1..page_number (inclusive)
    into the module-level ``page_link`` list.

    :param page_number: total number of listing pages to crawl.
    """
    # range(1, page_number + 1): the original used range(1, page_number),
    # which silently skipped the final listing page.
    for each_number in range(1, page_number + 1):
        full_link = 'https://www.jihaoba.com/escrow/?&_city=5&_mhead=1&page={}'.format(each_number)
        # timeout so one stalled request cannot hang the whole crawl
        get_link = requests.get(full_link, timeout=10)
        soup = BeautifulSoup(get_link.text, 'lxml')
        for each_url in soup.select(' ul > li.operation > a'):
            page_link.append(each_url.get('href'))
        time.sleep(0.01)  # small pause to be polite to the server
get_page_link(324) # the site has 324 listing pages in total
def detail_page(src):
    """Fetch one detail page and return its scraped fields as a dict.

    :param src: href collected from a listing page (usually root-relative);
        it is resolved against the site root with ``urljoin``, which also
        avoids the double slash the original hard-coded '/' produced.
    :returns: dict with keys mobile/area/price/describe/agent/agent_mobile.
    :raises IndexError: when an expected element is missing (callers skip
        such pages).
    """
    def _clean(text, extra=''):
        # Remove layout whitespace (\r \n \t) plus any extra characters in
        # one C-level pass instead of chained .replace() calls.
        return text.translate(str.maketrans('', '', '\r\n\t' + extra))

    # NOTE(review): verify=False disables TLS certificate checking — kept
    # from the original; confirm it is actually required for this site.
    url = requests.get(urljoin('https://www.jihaoba.com/', src),
                       verify=False, timeout=10)
    soup = BeautifulSoup(url.text, 'lxml')
    mobiles = _clean(soup.select('div.hmxq.fright > div.pzc > div.phaoma')[0].text)
    areas = soup.select('div.pzc > div.pguishu')[0].text
    prices = _clean(soup.select(' div.hmxq.fright > ul')[0].text)
    describes = _clean(soup.select('div.saler-say > div > div.saler-neirong')[0].text).strip()
    agents = soup.select('div.escrow_logo > div > p.escrow-name > a')[0].text
    agent_mobiles = _clean(soup.select('div.jj_phone')[0].text, '\xa0').strip()
    detail_dict = {
        'mobile': mobiles,              # the phone number for sale
        'area': areas,                  # region
        'price': prices,                # price / pattern / special chars / meaning
        'describe': describes,          # seller's description
        'agent': agents,                # agent name
        'agent_mobile': agent_mobiles   # agent's contact number
    }
    time.sleep(0.01)  # throttle between detail-page requests
    return detail_dict
# Scrape every collected detail page and export the results to Excel.
# Build a list of record dicts and construct the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame
# on every iteration (O(n^2)).
records = []
for link in page_link:
    try:
        records.append(detail_page(link))
    except IndexError:
        # Pages missing an expected element are deliberately skipped.
        pass
df1 = pd.DataFrame(records)
# Export to the desktop.  NOTE(review): the r-prefix keeps the doubled
# backslashes literally in the path; Windows tolerates them, so the
# original path string is preserved byte-for-byte.
df1.to_excel(r'C:\\Users\\Administrator\\Desktop\\1.xlsx',sheet_name='sheet1')