# coding=utf-8
import json
import requests
from bs4 import BeautifulSoup
import db
# Fetch a page of raw HTML.
def get_data(url):
    """POST to *url* and return the response body as a single line of text.

    The response is force-decoded as UTF-8 and all CR/LF characters are
    stripped so later string handling sees one flat line of markup.

    NOTE(review): verify=False disables TLS certificate checks and the
    request is a POST with no payload — both kept from the original;
    confirm they are intentional.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'}
    # timeout added so a stalled server cannot hang the crawl forever
    response = requests.post(url, headers=headers, verify=False, timeout=30)
    response.encoding = 'utf-8'
    text = response.text
    text = text.replace('\n', '')
    text = text.replace('\r', '')
    return text
# Persist one page of scraped records.
def writeData(listData):
    """Write *listData* (a list of {'key', 'value'} dicts) to the database.

    Delegates to db.dbInsert; the old commented-out code that appended
    JSON lines to jd.txt has been removed as dead code.
    """
    db.dbInsert(listData)
# Pull the interesting fields out of a search-result page.
# Returns a list of dicts, one per product.
def analyzeData(html):
    """Extract product records from *html*.

    Each ``div.p-name`` element yields one dict:
    ``{'key': <href of its first anchor>, 'value': <its stripped text>}``.
    Elements that contain no anchor are skipped instead of raising
    IndexError (the original crashed on them).
    """
    listData = []
    soup = BeautifulSoup(html, 'lxml')
    for content in soup.select('div.p-name'):
        anchors = content.select('a')
        if not anchors:
            # defensive: a name block without a link has no usable key
            continue
        detail = {}
        detail['key'] = anchors[0].attrs.get('href')
        detail['value'] = content.text.strip()
        listData.append(detail)
    return listData
# Crawl page after page until the site starts repeating itself: when the
# last record of a page equals the last record of the previous page, we
# have run off the end of the result set.
page = 0
lastKey = ''
while True:
    # fetch the next page of results
    page = page + 1
    url = ""  # TODO(review): paginated search URL is blank — presumably built from `page`
    data = get_data(url)
    listData = analyzeData(data)
    if not listData:
        # empty page: nothing left to scrape (also avoids IndexError below)
        break
    lastKeyTemp = listData[-1].get('key')
    if lastKeyTemp == lastKey:
        # same final record as last time -> no new data, stop
        break
    lastKey = lastKeyTemp
    writeData(listData)
import psycopg2
def dbInsert(listData):
    """Insert scraped {'key', 'value'} records into PostgreSQL table t_data.

    Security/correctness fixes over the original:
    - the SQL is parameterized instead of concatenated from raw strings
      (the old form was an injection risk and broke on any quote in the data);
    - each row is committed individually and a failed row is rolled back,
      so one bad row no longer aborts the transaction and silently
      discards every row after it.
    """
    conn = psycopg2.connect(database='', user='', password='',
                            host='',
                            port='5432')
    try:
        cur = conn.cursor()
        for data in listData:
            try:
                cur.execute(
                    'INSERT INTO t_data ("key", "value") VALUES (%s, %s)',
                    (data['key'], data['value']))
                conn.commit()
            except psycopg2.Error:
                # a failed statement aborts the current transaction; roll
                # back so the remaining rows can still be inserted
                conn.rollback()
                continue
    finally:
        conn.close()