# Import the modules we need, connect to the MySQL database and create a cursor.
import logging
import re
import time

import pymysql
import requests
from lxml import etree
# Module-level MySQL connection and cursor shared by the scraping functions.
# `password` / `database` are PyMySQL's preferred keyword names; `passwd` / `db`
# are deprecated aliases kept only for MySQLdb compatibility.
# NOTE(review): credentials are hard-coded here; consider loading them from
# the environment or a config file.
conn = pymysql.connect(
    host='localhost',
    user='root',
    password='1234',
    database='mydatabase1',
    port=3306,
    charset='utf8',
)
cursor = conn.cursor()

# Request headers carrying a desktop-browser User-Agent so the requests look
# like they come from a regular browser, to reduce the chance of the crawl
# being rejected or the IP being banned.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
}
def get_house_url(url):
    """Fetch a listing index page and scrape every house it links to.

    Args:
        url: Address of the index page to download.

    Each ``href`` found under ``div.house-title > a`` is passed to
    ``get_house_info``. Nothing is returned.
    """
    # Request the page with the browser-like headers defined at module level.
    html = requests.get(url, headers=headers)
    # Parse the page source into an element tree that can be queried with XPath.
    selector = etree.HTML(html.text)
    # Collect the detail-page links of all listings on this index page.
    house_hrefs = selector.xpath('//div[@class="house-title"]/a/@href')
    for house_href in house_hrefs:
        get_house_info(house_href)
def get_house_info(url):
    """Scrape one listing detail page and insert it into ``suzhou_house``.

    Args:
        url: Address of the listing detail page to download.

    The fields name, village, price, style, area and unit_price are read from
    fixed XPath locations. If any of them is missing from the page, the page
    is skipped with a logged warning and nothing is inserted.

    NOTE: no commit is issued here. The row is written through the
    module-level ``cursor`` and is only persisted once ``conn.commit()`` is
    called elsewhere (PyMySQL does not autocommit by default).
    """
    # Request the page with the browser-like headers and parse its source
    # into an element tree that can be queried with XPath.
    html = requests.get(url, headers=headers)
    selector = etree.HTML(html.text)

    # Only the `[0]` lookups can raise IndexError (an XPath that matched
    # nothing), so the try body is limited to the extraction itself.
    try:
        name = selector.xpath('//*[@id="content"]/div[2]/h3/text()')[0]
        village = selector.xpath('//*[@id="content"]/div[3]/div[1]/div[3]/div/div[1]/div/div[1]/dl[1]/dd/a/text()')[0]
        price = selector.xpath('//*[@id="content"]/div[3]/div[1]/div[1]/span[1]/em/text()')[0]
        style = selector.xpath('//*[@id="content"]/div[3]/div[1]/div[3]/div/div[1]/div/div[2]/dl[1]/dd/text()')[0]
        area = selector.xpath('//*[@id="content"]/div[3]/div[1]/div[1]/span[3]/em/text()')[0]
        unit_price = selector.xpath('//*[@id="content"]/div[3]/div[1]/div[3]/div/div[1]/div/div[3]/dl[2]/dd/text()')[0]
    except IndexError:
        # Best-effort crawl: a page with an unexpected layout is skipped, but
        # leave a trace so missing rows can be diagnosed.
        logging.warning("Skipping %s: an expected field was not found", url)
        return

    # Parameterized insert; values are converted to plain `str` before being
    # handed to the driver.
    cursor.execute(
        "insert into suzhou_house(name,village,price,style,area,unit_price) values(%s,%s,%s,%s,%s,%s)",
        (str(name), str(village), str(price), str(style), str(area), str(unit_price)),
    )