首先效果展示:
刚运行的界面
下面源代码部分:
from bs4 import BeautifulSoup #网页解析,获取数据
import re #正则表达式,进行文字匹配
import urllib.request,urllib.error,urllib #制定URL,获取网页数据
import xlwt #进行exclel操作
import sqlite3 #进行SQLite数据库操作
import io
import sys
import csv
import requests
from lxml import etree #使用xpath
# Force UTF-8 as the encoding of standard output.
# The rewrap is skipped when there is no usable stdout stream (for example a
# windowed build made with ``--noconsole``), because accessing ``.buffer`` on
# a missing stream would raise AttributeError at start-up.
if getattr(sys.stdout, "buffer", None) is not None:
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
import PySide2
from PySide2.QtWidgets import QApplication, QMessageBox
from PySide2.QtUiTools import QUiLoader
class Stats:
    """Window controller for the Anjuke (Chengdu) second-hand housing scraper.

    Loads a Qt Designer ``.ui`` file, connects its two push buttons to the
    search and CSV-export handlers, scrapes listing pages on demand and keeps
    the most recent result set in ``self.house_info``.
    """

    BASEURL = "https://chengdu.anjuke.com/sale/"
    # Default Designer file. Kept absolute because, per the original author's
    # note, a relative path failed to load when run from the IDE.
    UI_PATH = 'F:\\Azhong\\spider\\anjuke_ui_item\\ajk_item.ui'
    # Column order shared by the result table and the CSV export.
    FIELDS = ("title", "house_status", "address", "total_price", "unit_price")
    # Seconds to wait for the server before giving up on a page request.
    REQUEST_TIMEOUT = 10
    # Whitespace (including non-breaking spaces) stripped from address parts.
    _BLANK = re.compile(r'\s|\xa0')

    def __init__(self, ui_path=None):
        """Load the UI definition and wire up the button handlers.

        Args:
            ui_path: Path of the ``.ui`` file to load. Defaults to
                ``Stats.UI_PATH`` when omitted.
        """
        # Start with an empty result set so exporting before the first search
        # writes a header-only CSV instead of raising AttributeError.
        self.house_info = []
        # The loader creates the window object dynamically; its widgets become
        # attributes of ``self.ui`` (e.g. ``self.ui.pushButton``).
        self.ui = QUiLoader().load(ui_path or self.UI_PATH)
        self.ui.pushButton.clicked.connect(self.search_clicked)
        self.ui.pushButton_2.clicked.connect(self.create_csvfile_clicked)

    def askURL(self, url):
        """Fetch ``url`` with a browser-like user agent and return the body text.

        Args:
            url: Address of the page to download.

        Returns:
            The response body decoded to ``str`` by ``requests``.
        """
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36",
        }
        # A timeout keeps a stalled connection from blocking the caller forever.
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        return response.text

    def spider(self, baseurl, pages, key):
        """Scrape listing pages and return one dict per listing.

        Args:
            baseurl: Listing URL prefix; ``p<N>/?kw=<key>`` is appended to it.
            pages: Exclusive upper bound of the page numbers to fetch (int or
                numeric string); pages ``1 .. int(pages) - 1`` are requested.
            key: Search keyword, URL-encoded into the ``kw`` query parameter.

        Returns:
            A list of dicts keyed by the names in ``FIELDS``. Every value is
            the list returned by the corresponding XPath query.
        """
        query = urllib.parse.urlencode({'kw': key})
        house_info = []
        for page in range(1, int(pages)):
            url = baseurl + 'p' + str(page) + '/?' + query
            tree = etree.HTML(self.askURL(url))
            for li in tree.xpath('//ul[@class = "houselist-mod houselist-mod-new"]/li'):
                address = li.xpath('.//div[@class = "details-item"][2]/span/text()')
                # Strip whitespace inside each fragment, then drop the
                # fragments that end up empty.
                address = [self._BLANK.sub('', part) for part in address]
                house_info.append({
                    'title': li.xpath('.//div[@class = "house-title"]/a/@title'),
                    'house_status': li.xpath('.//div[@class = "details-item"][1]/span/text()'),
                    'address': [part for part in address if part],
                    'total_price': li.xpath('.//div[@class = "pro-price"]/span[1]/strong/text()'),
                    'unit_price': li.xpath('.//div[@class = "pro-price"]/span[2]/text()'),
                })
        return house_info

    def save_csv(self, house_info):
        """Write ``house_info`` to ``house.csv`` in the current directory.

        Args:
            house_info: Iterable of dicts keyed by the names in ``FIELDS``.
                Non-string values are written using their ``str()`` form.
        """
        # newline='' lets the csv module control line endings; without it an
        # extra blank line appears after every row on Windows.
        with open("house.csv", 'w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, self.FIELDS)
            writer.writeheader()
            writer.writerows(house_info)

    def search_clicked(self):
        """Run a search from the form inputs and show the results in the table."""
        key = self.ui.lineEdit.text()
        pages = self.ui.spinBox.value()
        # spider() treats its page argument as an exclusive bound, hence + 1.
        self.house_info = self.spider(self.BASEURL, pages + 1, key)
        table = self.ui.tableWidget
        table.setRowCount(len(self.house_info))
        for row, house in enumerate(self.house_info):
            for column, field in enumerate(self.FIELDS):
                table.setItem(row, column, PySide2.QtWidgets.QTableWidgetItem(str(house[field])))

    def create_csvfile_clicked(self):
        """Export the most recent search results to ``house.csv``."""
        self.save_csv(self.house_info)
# Script entry point: create the Qt application, show the main window and run
# the event loop. The guard keeps the GUI from launching when this module is
# merely imported, and the event loop's return code becomes the exit status.
if __name__ == "__main__":
    app = QApplication([])
    stats = Stats()
    stats.ui.show()
    sys.exit(app.exec_())
用pyinstaller把项目打包,可以让其他人使用
1.先安装 pip install pyinstaller
2.在项目路径下输入pyinstaller httpclient.py --noconsole --hidden-import PySide2.QtXml
这样就会在当前目录下产生一个名为 dist 的目录。里面就有一个名为 httpclient 的目录,我们的可执行程序 httpclient.exe 就在里面。
其中
--noconsole 指定不要命令行窗口,否则我们的程序运行的时候,还会多一个黑窗口。但是我建议大家可以先去掉这个参数,等确定运行成功后,再加上参数重新制作exe。因为这个黑窗口可以显示出程序的报错,这样我们容易找到问题的线索。
--hidden-import PySide2.QtXml 参数是因为这个 QtXml 库是动态导入的,PyInstaller 没法分析出来,需要我们告诉它。
注意:由于我导入ui的路径是绝对路径,弊端是在那个路径下需要有ui文件,否则运行不了;如果源代码导入使用相对路径,则需要把ui文件复制到dist目录下面。
缺点:
1. 在编译器(IDE)中直接运行时,使用相对路径加载ui文件会报错:
RuntimeError: Unable to open/read ui device
(把 self.ui = QUiLoader().load('ajk_item.ui') 改为 self.ui = QUiLoader().load('F:\\Azhong\\spider\\anjuke_ui_item\\ajk_item.ui') 就可以在编译器中运行)
2. 网站设有人机检验,有时需要手动进入网站完成检验。目前没有深究爬虫层面的解决方法,日后有机会可以改进。博主猜想设置多个 user-agent、每次随机选择一个发送请求或许可以解决此问题,但这只是想法,未曾验证。
此程序是自学时所写,本人非专业人士,欢迎交流,轻喷。