from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup #解析网页的库
from selenium import webdriver #模拟执行js,需配合phantomjs使用
import time
import json
import pymysql
import re
# Fetch the Fang.com Zhengzhou house-price page. On an HTTP error we just
# report it and continue — the raw response is not used below, since the
# actual scraping is done through Selenium (the page builds its content
# with JavaScript, which urlopen cannot execute).
try:
    html = urlopen("http://fangjia.fang.com/zz/")
except HTTPError as e:
    print(e)

# Render the page with PhantomJS so the JS-generated iframe exists, then
# navigate into the iframe and print the text of every "_container" element.
# NOTE(review): PhantomJS support is deprecated in modern Selenium — consider
# headless Chrome/Firefox when upgrading.
url = "http://fangjia.fang.com"
driver = webdriver.PhantomJS(executable_path='D:\\phantomjs-2.1.1-windows\\bin\\phantomjs')
try:
    driver.get(url + "/zz/")
    time.sleep(3)  # crude fixed wait for the JS to populate the iframe
    iframe = driver.find_element_by_tag_name("iframe")
    # Renamed from 'map', which shadowed the builtin of the same name.
    iframe_src = iframe.get_attribute("src")
    driver.get(iframe_src)
    time.sleep(3)
    for container in driver.find_elements_by_id("_container"):
        # Collapse newlines so each container prints as a single line.
        text = re.sub("\\n", " ", container.text)
        if text != "":
            print(text)
            print("***************************")
finally:
    # Always tear down the PhantomJS process, even if scraping raised —
    # otherwise a phantomjs.exe is leaked on every failed run.
    driver.close()
def mysqltest():
    """Smoke-test the local MySQL server: print every row of mysql.user.

    Connects as root to the 'mysql' system database on localhost and dumps
    the full contents of the user table. The cursor and connection are
    released even if the query raises.
    """
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='mysql')
    try:
        # pymysql cursors are context managers: the cursor is closed on exit.
        with conn.cursor() as cur:
            # db='mysql' in connect() already selects the database, so the
            # old explicit "use mysql" statement was redundant and dropped.
            cur.execute("select * from user")
            print(cur.fetchall())
    finally:
        # Close the connection even when execute()/fetchall() fails.
        conn.close()
# mysqltest()
# 初探python爬虫 — "A first look at Python web scraping"
# (Blog metadata: latest recommended article published 2024-10-20 19:52:39)