def init_db():
    """Create the PYPI table in the SQLite database if it does not exist.

    Opens the database at the module-level path PYPI_DB.  Each row stores
    one PyPI release record: an insertion timestamp, the release date
    split into YEAR/MONTH/DAY, the package NAME, VERSION, optional
    DESCRIPTION and the package page HREF.
    """
    conn = sqlite3.connect(PYPI_DB)
    try:
        conn.execute('''CREATE TABLE IF NOT EXISTS PYPI
        (
        ID INTEGER PRIMARY KEY AUTOINCREMENT,
        TIMESTAMP INTEGER NOT NULL,
        YEAR TEXT NOT NULL,
        MONTH TEXT NOT NULL,
        DAY TEXT NOT NULL,
        NAME TEXT NOT NULL,
        VERSION TEXT NOT NULL,
        DESCRIPTION TEXT ,
        HREF TEXT NOT NULL
        );
        ''')
        conn.commit()
    finally:
        # Close the handle even if CREATE TABLE raises, so it is not leaked.
        conn.close()
def insert_data(tmp_dict):
    """Insert one PyPI release record into the PYPI table.

    tmp_dict is expected to carry the keys 'date' (YYYY-MM-DD), 'name',
    'ver', 'discript' and 'href' — see the example record elsewhere in
    this file.  Any error is logged and swallowed so one bad record does
    not break the caller's polling loop.
    """
    try:
        timestamp = int(time.time())
        conn = sqlite3.connect(PYPI_DB)
        try:
            conn.text_factory = str
            # 'date' is "YYYY-MM-DD"; store the parts in separate columns.
            parts = tmp_dict['date'].split('-')
            year, month, day = parts[0], parts[1], parts[2]
            # BUG FIX: use execute() for a single row — executemany() with a
            # one-element list was a misuse of the API.
            conn.execute(
                "INSERT INTO PYPI (TIMESTAMP,YEAR,MONTH,DAY,NAME,VERSION,DESCRIPTION,HREF) VALUES (?,?,?,?,?,?,?,?);",
                (timestamp, year, month, day,
                 tmp_dict['name'], tmp_dict['ver'],
                 tmp_dict['discript'], tmp_dict['href']))
            conn.commit()
        finally:
            # BUG FIX: the connection used to leak whenever a statement raised.
            conn.close()
    except Exception as e:
        # BUG FIX: this call was missing its closing parenthesis (SyntaxError);
        # also switched to `except ... as e`, valid in Python 2.6+ and 3.
        logging.error('%s' % str(e))
# Example of one record as written to RECORD_FILEPATH (one JSON object per
# line, keyed by the MD5 of date + pkg + discript + href):
#
# {
#     "bd433e1e73b938923cc594389e433780": {
#         "discript": "This is an essential package for managing AMaaS infra layer",
#         "ver": "0.2.47",
#         "name": "amaasinfra",
#         "href": "https://pypi.python.org/pypi/amaasinfra/0.2.47",
#         "pkg": "amaasinfra 0.2.47",
#         "date": "2017-10-01"
#     }
# }
import os
import bs4
import json
import time
import sqlite3
import urllib2
import hashlib
import logging
from collections import deque
def get_web_data():
    """Fetch the PyPI recent-updates page and record new releases.

    Downloads the page referenced by the module-level `req`, parses the
    package table, skips records already present in the global
    `record_map` deque, inserts new ones into SQLite via insert_data(),
    and rewrites RECORD_FILEPATH with one JSON object per known record.
    All errors are logged and swallowed so the polling loop keeps running.
    """
    try:
        # record_map is a fixed-length deque (global); at the time the
        # PyPI front page showed 40 submission records.
        global record_map
        res_data = urllib2.urlopen(req, timeout=2)
        res_html = res_data.read()
        res_data.close()
        soup = bs4.BeautifulSoup(res_html, "html5lib")

        # Each table row is three <td> cells — date, "name version"
        # (separated by a non-breaking space), description — and the
        # matching link for each row comes from the <a> tags, in order.
        all_href = [a.get("href") for a in soup.table.select("a")]
        href_flag = 0
        segment_flag = 0
        for content in soup.table.select("td"):
            if segment_flag == 0:
                tmp_date = content.get_text().encode("utf-8")
                segment_flag = 1
            elif segment_flag == 1:
                tmp_pkg = content.get_text().encode("utf-8")
                # '\xc2\xa0' is a UTF-8-encoded non-breaking space.
                tmp_name = tmp_pkg.split('\xc2\xa0')[0]
                tmp_ver = tmp_pkg.split('\xc2\xa0')[1]
                segment_flag = 2
            else:
                tmp_discript = content.get_text().encode("utf-8")
                tmp_href = 'https://pypi.python.org' + all_href[href_flag].encode("utf-8")
                href_flag += 1
                segment_flag = 0
                # Deduplicate on the MD5 of the whole record.  Each entry of
                # record_map is a single-key dict {md5: record}, so a
                # membership test replaces the py2-only `i.keys()[0]`.
                tmp_md5 = hashlib.md5(tmp_date + tmp_pkg + tmp_discript + tmp_href).hexdigest()
                if not any(tmp_md5 in rec for rec in record_map):
                    tmp_record = {tmp_md5: {"date": tmp_date,
                                            "pkg": tmp_pkg,
                                            "discript": tmp_discript,
                                            "href": tmp_href,
                                            "name": tmp_name,
                                            "ver": tmp_ver}}
                    record_map.append(tmp_record)
                    insert_data(tmp_record[tmp_md5])
        # Persist the current dedup window, one JSON object per line.
        with open(RECORD_FILEPATH, 'w') as ff:
            for rec in record_map:
                ff.write(json.dumps(rec) + '\n')
    except Exception as e:
        logging.error('%s' % str(e))