爬虫代码(TJ)

getip.py 来自 https://mp.csdn.net/postedit/99288836

import getip
import re
import cx_Oracle
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pymysql
# Scrape Tujia (tujia.com) Hangzhou apartment listings into the MySQL table
# `hzdzsj`: each listing page is fetched with urllib, every detail link on it
# is rendered with headless Chrome, parsed with BeautifulSoup and inserted as
# one row.

# Sample pages handed to getip.check() as its test URLs.
testurl = [
    "https://www.tujia.com/gongyu/hangzhou/1/",
    "https://www.tujia.com/gongyu/hangzhou/2/",
]
# Sample detail pages; not referenced anywhere in this script.
testur2 = [
    "https://www.tujia.com/detail/12690196.htm",
    "https://www.tujia.com/detail/11146003.htm",
]
# Proxy-provider endpoint handed to getip.check().
thisapi = 'http://ip.11jsq.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=0&fa=0&fetch_key=&groupid=0&qty=1&time=100&pro=&city=&port=1&format=txt&ss=1&css=&dt=1&specialTxt=3&specialJson='

CHROMEDRIVER_PATH = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
LISTING_URL_PREFIX = "https://www.tujia.com/gongyu/hangzhou/"
SITE_NAME = "途家网"

# Each fetch is tried at most this many times before the page/link is skipped.
MAX_ATTEMPTS = 2
# Responses shorter than these are treated as a failed fetch and retried
# (presumably a block/verification page rather than real content -- confirm).
MIN_LISTING_LENGTH = 3000
MIN_DETAIL_LENGTH = 200000

# Captures the href of every detail link on a listing page.
DETAIL_LINK_RE = re.compile(
    '<div class="label-tag">.*?<div class="noMmpty">.*?</div>.*?</div><a class="house-detail-link" href="(.*?)" target="_blank">',
    re.S)

INSERT_SQL = 'insert into hzdzsj (标题,地址,房屋描述,租金,房东ID,网站) values(%s,%s,%s,%s,%s,%s)'

# Digit substitution applied to the scraped price text: the digit d found on
# the page is replaced by PRICE_DIGIT_MAP[d].  Presumably this undoes the
# site's price obfuscation -- TODO confirm the table is still current.
PRICE_DIGIT_MAP = ('9', '1', '2', '4', '6', '8', '1', '3', '5', '7')


def _rotate_proxy():
    """Refresh the module-level ``ip``/``ua`` pair through getip and install it.

    What ``getip.check`` and ``getip.install`` do exactly is defined in
    getip.py, which is not part of this file.
    """
    global ip, ua
    ip, ua = getip.check(ip, thisapi, testurl)
    getip.install(ip, ua)


def _fetch_listing_html(url):
    """Return the body of *url* decoded as UTF-8, ignoring undecodable bytes."""
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8", "ignore")


def _render_detail_html(link):
    """Load *link* in headless Chrome and return the rendered page source.

    NOTE(review): Chrome is started without any proxy argument, so unless
    getip.install() configures the browser some other way these requests
    probably do not go through the rotated proxy -- confirm.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(
        executable_path=CHROMEDRIVER_PATH,
        chrome_options=chrome_options)
    try:
        driver.get(link)
        return driver.page_source
    finally:
        # quit() ends the whole browser session, so no chromedriver process is
        # left behind when get()/page_source raises.
        driver.quit()


def _decode_price(obfuscated):
    """Translate the scraped price text digit by digit and return it as an int.

    Raises ValueError if the text is empty or contains a non-digit character.
    """
    return int(''.join(PRICE_DIGIT_MAP[int(char)] for char in obfuscated))


def _parse_detail(html):
    """Extract one database row from a rendered detail page.

    Returns ``[title, address, description, price, host, site]``.  Raises
    AttributeError, IndexError or ValueError when an expected element is
    missing or the price cannot be decoded; the caller treats that as a
    failed attempt.
    """
    soup = BeautifulSoup(html, 'lxml')
    titles = soup.find_all('span', attrs={'class': 'title__name'})
    prices = soup.find_all('span', attrs={'class': 'price__count'})
    address_spans = soup.find(
        'address', attrs={'class': 'unit-title__address'}).find_all('span')
    host_links = soup.find(
        'div', attrs={'class': 'unit-contact__land__main'}).find_all('a')
    descriptions = soup.find_all('div', attrs={'class': 'unit-description simple'})
    if not descriptions:
        # Fall back to the description block without the "simple" modifier.
        descriptions = soup.find_all('div', attrs={'class': 'unit-description'})
    description = descriptions[0].text.replace('\n', '')

    return [
        titles[0].text,
        address_spans[0].text,
        description,
        _decode_price(prices[0].text),
        host_links[0].text,
        SITE_NAME,
    ]


def _store_listing(row):
    """Insert *row* and commit; on failure print the error and carry on."""
    global count
    try:
        cur.execute(INSERT_SQL, row)
        conn.commit()
        print("第%d" % (count) + "条数据插入成功")
        count = count + 1
    except Exception as e:
        # Best effort: a failed insert is reported but is not retried.
        print(e)


def _scrape_detail(link):
    """Fetch, parse and store one detail page, trying at most MAX_ATTEMPTS times."""
    for _ in range(MAX_ATTEMPTS):
        try:
            _rotate_proxy()
            html = _render_detail_html(link)
            if len(html) < MIN_DETAIL_LENGTH:
                continue
            print("----内链接当前IP有效--------")
            _store_listing(_parse_detail(html))
            return
        except Exception as err:
            print(err)
            print("-----------内链接出现异常,准备重试-------------")


def _scrape_listing_page(page):
    """Scrape every detail link on listing page number *page*.

    The listing fetch is tried at most MAX_ATTEMPTS times.  The page is
    reported as done once its links have been processed, whether or not each
    individual link could be stored.
    """
    url = LISTING_URL_PREFIX + str(page) + '/'
    for _ in range(MAX_ATTEMPTS):
        try:
            _rotate_proxy()
            data = _fetch_listing_html(url)
            if len(data) < MIN_LISTING_LENGTH:
                continue
            print("----当前IP有效--------")
            links = DETAIL_LINK_RE.findall(data)
            print(links)
            for link in links:
                print(link)
                _scrape_detail(link)
            print("----------------第%d" % (page) + "页插入成功--------------")
            return
        except Exception as err:
            print(err)
            print("-----------出现异常,准备重试-------------")


ip, ua = getip.check(0, thisapi, testurl)
conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='******', db='hzdz', charset='utf8')
cur = conn.cursor()

# Number used in the "row inserted" progress message; presumably continues the
# numbering of an earlier run -- confirm before reusing.
count = 1671
try:
    # Session-level settings only need to be sent once per connection rather
    # than before every insert.
    cur.execute("USE hzdz")
    cur.execute('SET NAMES utf8')
    cur.execute('SET CHARACTER SET utf8')
    cur.execute('SET character_set_connection=utf8')

    for j in range(776, 786):
        _scrape_listing_page(j)
finally:
    # Release the database handles even if the run is interrupted.
    cur.close()
    conn.close()

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值