# A first look at Python web scraping (初探python爬虫)

from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup #解析网页的库
from selenium import webdriver #模拟执行js,需配合phantomjs使用
import time
import json
import pymysql
import re

# Fetch the listing page with urllib. `html` is bound to None up front so the
# name always exists afterwards, even when the request fails with an HTTP
# error (in which case the error is printed and `html` stays None).
html = None
try:
    # html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
    # html = urlopen("http://www.pythonscraping.com/pages/page3.html")
    html = urlopen("http://fangjia.fang.com/zz/")
except HTTPError as e:
    print(e)
# line = html.read().decode('utf-8')
# print(line)
# print(line.decode('gb18030'))
# print(html.read().decode('utf-8'))

# bsObj = BeautifulSoup(line)

# <find and findAll>

# nameList = bsObj.findAll("span", {"class": "green"})
# for name in nameList:
#     print(name.get_text())

# <children and descendants>
# for child in bsObj.find("div", {"id": "_container"}).descendants:
#     print(child)

# for container in bsObj.find("div", {"id": "_container"}):
#     add=container.get_text()
#     print(add)



# <selenium>
# Open the listing page in PhantomJS, follow the first <iframe> to the page it
# embeds, and print the text of every element whose id is "_container".
url = "http://fangjia.fang.com"
driver = webdriver.PhantomJS(executable_path='D:\\phantomjs-2.1.1-windows\\bin\\phantomjs')
try:
    driver.get(url + "/zz/")
    # Fixed pause; presumably gives the page's JavaScript time to run — TODO confirm.
    time.sleep(3)
    iframe = driver.find_element_by_tag_name("iframe")
    # Named map_url (not `map`) so the builtin map() is not shadowed.
    map_url = iframe.get_attribute("src")
    driver.get(map_url)
    time.sleep(3)

    # Alternative: take the rendered page source and parse it with BeautifulSoup.
    # pageSource = driver.page_source
    # bsObj=BeautifulSoup(pageSource)

    for container in driver.find_elements_by_id("_container"):
        # Flatten multi-line element text onto a single line.
        text = container.text.replace("\n", " ")
        if text:
            print(text)
            print("***************************")
finally:
    # quit() (rather than close()) ends the whole browser session, and running
    # it in `finally` ensures that happens even when a step above raises.
    driver.quit()










def mysqltest() -> None:
    """Print every row of the ``user`` table in the local ``mysql`` database.

    Connects to the MySQL server at 127.0.0.1 as ``root``, selects the
    ``mysql`` database, runs ``select * from user`` and prints the fetched
    rows. The cursor and the connection are closed even if a query fails.
    """
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='mysql')
    try:
        # The cursor context manager closes the cursor on exit.
        with conn.cursor() as cur:
            cur.execute("use mysql")
            cur.execute("select * from user")
            print(cur.fetchall())
    finally:
        conn.close()

# mysqltest()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值