Day07—homework
纸上得来终觉浅,绝知此事要躬行
眼泪你别问,joker这个 “男人” 你别恨。
清华大学官网新闻页面爬取:
程序:
import requests
import threading
import pymysql
class tsinghua(threading.Thread):
    """Thread that scrapes news titles page by page and stores them in MySQL.

    Each listing page is fetched over HTTP, every line containing the
    ``class="jiequ">`` marker yields one title, and each title is inserted
    into the ``news`` table together with a running integer id.

    Args:
        target_url: URL template with one ``{}`` placeholder that receives
            ``index`` for page 1 and ``index_<page>`` for later pages.
        total_page: Number of listing pages to visit (pages 1..total_page).
        lock: Lock object held while the row id counter is incremented.
    """

    # Text that immediately precedes a title inside a listing line.
    _TITLE_MARKER = 'class="jiequ">'
    # Seconds to wait for the server; without a timeout requests.get can
    # block forever on a stalled connection.
    _REQUEST_TIMEOUT = 10

    def __init__(self, target_url, total_page, lock):
        super().__init__()
        self.target_url = target_url
        self.total_page = total_page
        self.db = pymysql.connect(host='127.0.0.1',
                                  port=3306,
                                  user='root',
                                  password='root',
                                  db='tsinghua')
        # Counter used as the primary key of inserted rows; it is local to
        # this thread object, so ids restart at 1 for every instance.
        self.id = 0
        self.lock = lock

    def run(self):
        """Fetch every page, insert each title found, then close the DB."""
        try:
            for page in range(1, self.total_page + 1):
                page_name = 'index' if page == 1 else 'index_' + str(page)
                url = self.target_url.format(page_name)
                try:
                    response = requests.get(url,
                                            timeout=self._REQUEST_TIMEOUT)
                except requests.RequestException as e:
                    # One unreachable page should not abort the whole crawl.
                    print(e)
                    continue
                response.encoding = 'utf8'
                for title in self._extract_titles(response.text):
                    with self.lock:
                        self.id += 1
                        row_id = self.id
                    # Placeholders let the driver escape the scraped title,
                    # so quotes in a headline cannot break or inject SQL.
                    self.write_to_mysql(
                        'INSERT INTO news (id,title) values (%s,%s);',
                        (row_id, title))
                    print('[+] writ mysql over ~')
                print('[+] current page: {} ok'.format(page))
        finally:
            # The connection was opened in __init__ and is only used here.
            self.db.close()

    @classmethod
    def _extract_titles(cls, html):
        """Return the title of every line of *html* that carries the marker.

        Lines without the full ``class="jiequ">`` marker are skipped, which
        includes lines where the attribute is followed by other attributes.
        """
        titles = []
        for line in html.split('\n'):
            pieces = line.split(cls._TITLE_MARKER)
            if len(pieces) < 2:
                continue
            titles.append(pieces[1].split('</a>')[0])
        return titles

    def write_to_mysql(self, sql, params=None):
        """Execute *sql* and commit; on any error print it and roll back.

        Args:
            sql: Statement to execute, optionally with ``%s`` placeholders.
            params: Optional sequence of values bound to the placeholders.
        """
        cursor = self.db.cursor()
        try:
            cursor.execute(sql, params)
            self.db.commit()
        except Exception as e:
            # Best effort: a failed row is reported and skipped.
            print(e)
            self.db.rollback()
        finally:
            cursor.close()
def Running():
    """Start one scraper thread over 30 listing pages and wait for it."""
    url_template = 'http://news.tsinghua.edu.cn/publish/thunews/9648/{}.html'
    page_count = 30
    crawler = tsinghua(url_template, page_count, threading.Lock())
    crawler.start()
    # Block until the crawl has finished before reporting completion.
    crawler.join()
    print('Over !')
# Start the crawl only when the file is executed directly, not on import.
if __name__ == "__main__":
    Running()