import re
import time

import xlwt
from requests_html import HTMLSession

# A session must be created with HTMLSession() before any page is requested;
# every fetch in this file goes through this single shared instance.
session = HTMLSession()

# Workbook and the one sheet ("Sheet1") that the scraped rows are written to.
xls = xlwt.Workbook()
sht1 = xls.add_sheet('Sheet1')
def guolv(src):
    """Extract the address from the string form of a set of links.

    The caller passes ``str(some_set)``, which for a one-element set looks
    like ``{'http://example.com/page'}``. This returns the text between the
    ``{'`` and ``'}`` delimiters of the first such group found in ``src``.

    Args:
        src: Text containing at least one ``{'...'}`` group.

    Returns:
        The content of the first ``{'...'}`` group.

    Raises:
        IndexError: If ``src`` contains no ``{'...'}`` group.
    """
    # Raw string with escaped braces and plain ASCII quotes: typographic
    # quotes in the pattern can never match the repr of a Python set.
    matches = re.findall(r"\{'(.*?)'\}", src)
    return matches[0]
def xiangqing(url):
    """Fetch a detail page and return the text of its main content section.

    Args:
        url: Address of the detail page to request.

    Returns:
        The text of the first element matching the content selector, or
        ``None`` when the request fails or the element is not on the page
        (a message is printed in that case).
    """
    try:
        r = session.get(url)
        txt = r.html.find(
            "#ContentPlaceHolder1_divNewsHtml > section:nth-child(1)",
            first=True,
        )
        # ``txt`` is None when the selector matches nothing; the resulting
        # AttributeError is handled below like any other fetch failure.
        return txt.text
    except Exception:
        # Best effort: one bad detail page must not abort the whole crawl.
        # ``Exception`` (not a bare ``except``) keeps Ctrl-C and SystemExit
        # working while the crawl is running.
        print("详情页请求超时")
        return None
class Deom:
    """Crawl the listing pages and export title, link and detail text to .xls.

    The crawl is written directly in the class body, so it runs once, when
    this class statement is executed; the class is not meant to be
    instantiated. For every listing page it reads up to nine ``li.media``
    entries and writes one sheet row per entry:

    * column 0 - text of the list entry (the title),
    * column 1 - string form of the entry's set of absolute links,
    * column 2 - detail text returned by ``xiangqing`` (may be ``None``).

    The workbook is saved once after all pages were processed, then the
    process sleeps so the console window stays open.
    """

    print("共抓取196页的title")
    # Row index of the record being written. It is incremented before each
    # write, so the first record lands in row 1 and row 0 stays empty.
    h = 0
    # The banner announces 196 pages; the end of range() is exclusive, so it
    # must be 197 to include page 196.
    for i in range(1, 197):
        url = 'http://www.chinabeauty.cn/?page=' + str(i)
        print(url)
        r = session.get(url)
        # Separate index name: the original reused ``i`` for both loops.
        for n in range(1, 10):
            item = "li.media:nth-child(" + str(n) + ")"
            font = r.html.find(item, first=True)
            if font is None:
                # Fewer entries on this page than expected: nothing to write.
                continue
            h = h + 1
            print(font.text)
            sht1.write(h, 0, font.text)  # title
            anchor = r.html.find(
                item + " > div:nth-child(2) > div:nth-child(1) > h4:nth-child(1) > a:nth-child(1)",
                first=True,
            )
            src = anchor.absolute_links if anchor is not None else None
            if not src:
                # No anchor or no link: keep the title row, skip link/detail
                # instead of crashing and losing the unsaved workbook.
                print("-----------------------------------")
                continue
            a = str(src)
            sht1.write(h, 1, a)  # row h, 2nd column: link set as text
            link = guolv(a)  # extract once, reuse for log and detail fetch
            print(link)
            print("下面是详情")
            connet = xiangqing(link)
            print(connet)
            sht1.write(h, 2, connet)  # row h, 3rd column: detail text
            print("-----------------------------------")
    xls.save('C:/Users/Administrator/Desktop/美妆网.xls')
    print("运行完成,请关闭退出")
    # Keep the process (and its console window) alive after finishing.
    time.sleep(9999)
# Script entry point. Only a reference to the class is bound here; Deom is
# never instantiated or called at this point.
if __name__ == '__main__':
    index = Deom