- selenium :自动化测试工具
- chromedriver.exe(win64位) :带界面的浏览器插件驱动
- PhantomJS :没有界面的浏览器引擎
- BeautifulSoup :解析html
- 官网地址:https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
我的完整代码如下:使用带界面的谷歌浏览器驱动(chromedriver)时可以正常运行,但换成不带界面的 PhantomJS 引擎后无法工作,访问页面时报错 500。
from flask import Flask
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
from selenium import webdriver
import time
# Flask application object; the route handler below is registered on it.
app = Flask(__name__)
@app.route('/')
def hello_world():
    """Handle ``/`` by scraping the hard-coded product size page.

    Returns:
        Whatever ``getUrl`` returns for the product URL below.
    """
    product_url = "http://www.huahaicang.cn/#/size?productId=356530511&brandId=2056402"
    return getUrl(product_url)
def getUrl(url):
    """Render *url* in a browser and return its first table body as text.

    The page is loaded with Selenium, its rendered HTML is parsed with
    BeautifulSoup, and the first ``<tbody>`` is converted into a dict that
    maps the text of each row's first cell to a list with the text of the
    remaining cells, e.g. ``{"size": ["34/XS", "36/S/AS", "38/M/AM"]}``.

    Args:
        url: Address of the page to load.

    Returns:
        ``str()`` of the collected dict. When the rendered page contains no
        ``<tbody>``, "no contents" is printed and ``"{}"`` is returned
        instead of raising.
    """
    # Bound before any parsing so the final return can never hit an
    # unassigned name, whatever the page turns out to contain.
    size_table = {}

    # A headless driver can be substituted here, e.g.
    # webdriver.PhantomJS(executable_path=r"...\phantomjs.exe").
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        # Fixed pause to let the page's scripts build the table.
        # NOTE(review): a slower driver may need longer than one second.
        time.sleep(1)
        html = browser.page_source
    finally:
        # Always shut the browser down; otherwise every request leaks a
        # browser process.
        browser.quit()

    soup = BeautifulSoup(html, "lxml")
    tbody = soup.tbody
    if tbody is None:
        print("no contents")
        return str(size_table)

    # Direct child tags only: skips newline/whitespace strings and comment
    # nodes that sit between the rows.
    rows = tbody.find_all(True, recursive=False)
    print(rows)

    for row in rows:
        cells = row.find_all(True, recursive=False)
        if not cells:
            # A row without cells carries neither a key nor values.
            continue
        # First cell is the key, the rest are its values. Slicing (rather
        # than comparing each cell with the first) keeps value cells whose
        # markup happens to equal the key cell.
        key = cells[0].string
        size_table[key] = [cell.string for cell in cells[1:]]

    print(size_table)
    return str(size_table)
if __name__ == '__main__':
    # Start the Flask server only when this file is executed directly.
    app.run()
- 参考:https://cuiqingcai.com/2652.html