from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup #解析网页的库
from selenium import webdriver #模拟执行js,需配合phantomjs使用
import time
import json
import pymysql
import re
# Fetch the Fang.com Zhengzhou house-price page. On an HTTP error we just
# report it and continue — the raw response is not used below, since the
# actual scraping is done through Selenium (the page builds its content
# with JavaScript, which urlopen cannot execute).
try:
    html = urlopen("http://fangjia.fang.com/zz/")
except HTTPError as e:
    print(e)

# Render the page with PhantomJS so the JS-generated iframe exists, then
# navigate into the iframe and print the text of every "_container" element.
# NOTE(review): PhantomJS support is deprecated in modern Selenium — consider
# headless Chrome/Firefox when upgrading.
url = "http://fangjia.fang.com"
driver = webdriver.PhantomJS(executable_path='D:\\phantomjs-2.1.1-windows\\bin\\phantomjs')
try:
    driver.get(url + "/zz/")
    time.sleep(3)  # crude fixed wait for the JS to populate the iframe
    iframe = driver.find_element_by_tag_name("iframe")
    # Renamed from 'map', which shadowed the builtin of the same name.
    iframe_src = iframe.get_attribute("src")
    driver.get(iframe_src)
    time.sleep(3)
    for container in driver.find_elements_by_id("_container"):
        # Collapse newlines so each container prints as a single line.
        text = re.sub("\\n", " ", container.text)
        if text != "":
            print(text)
            print("***************************")
finally:
    # Always tear down the PhantomJS process, even if scraping raised —
    # otherwise a phantomjs.exe is leaked on every failed run.
    driver.close()
def mysqltest():
    """Smoke-test the local MySQL server: print every row of mysql.user.

    Connects as root to the 'mysql' system database on localhost and dumps
    the full contents of the user table. The cursor and connection are
    released even if the query raises.
    """
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='mysql')
    try:
        # pymysql cursors are context managers: the cursor is closed on exit.
        with conn.cursor() as cur:
            # db='mysql' in connect() already selects the database, so the
            # old explicit "use mysql" statement was redundant and dropped.
            cur.execute("select * from user")
            print(cur.fetchall())
    finally:
        # Close the connection even when execute()/fetchall() fails.
        conn.close()
# mysqltest()
# 初探python爬虫 — "A first look at Python web scraping"
# (Blog metadata: latest recommended article published 2024-10-20 19:52:39)