可可心2009
发表于:2018-09-03 15:56 阅读: 494次
我们用 Python 开发网页爬虫时，经常会遇到 URL 无法访问的情况。下面记录项目中用于判断网址 URL 是否可用的代码。
代码通过 HEAD 请求获取每个 URL 的 HTTP 状态码，并据此更新数据库：返回 200 时将 flag_6 置为 0，返回 404 时将 flag_6 置为 1。废话不多说了，直接上代码：
import pymysql
import threading
import time
import urllib
import requests
# Check every not-yet-verified URL in table url_new with an HTTP HEAD request
# and record the outcome in column flag_6.

# Open the database connection. Keyword arguments are used because newer
# PyMySQL releases no longer accept positional connection parameters.
db = pymysql.connect(
    host="192.168.0.*",
    user="username",
    password="password",
    database="databasename",
)

# Value written to flag_6 for each HTTP status this script acts on:
# 200 -> 0 and 404 -> 1. Any other status leaves flag_6 NULL, so the row
# is picked up again by the SELECT below on the next run.
status_to_flag = {200: 0, 404: 1}

# Seconds to wait for a response; without a timeout a single unresponsive
# host would block the whole run indefinitely.
request_timeout = 10

try:
    # Create a cursor object for running the statements below.
    cursor = db.cursor()

    # Select the rows whose url_6 is non-empty and has no flag_6 yet.
    sql = "SELECT sku,url_6 FROM url_new where flag_6 is null and url_6<>'' "
    cursor.execute(sql)

    # Fetch the full list of (sku, url_6) rows.
    results = cursor.fetchall()

    for num, (sku, url) in enumerate(results, start=1):
        try:
            # HEAD requests only the response headers, which is enough to
            # read the status code.
            response = requests.head(url, timeout=request_timeout)
        except requests.RequestException as e:
            # Unreachable host, malformed URL, timeout, ...: report it and
            # leave flag_6 NULL instead of aborting the remaining rows.
            print(num, e)
            continue

        status = response.status_code
        print(num, status)

        flag = status_to_flag.get(status)
        if flag is None:
            continue

        try:
            # Parameterized statement: the driver escapes sku, so a value
            # containing quotes can neither break nor alter the SQL.
            cursor.execute(
                "UPDATE url_new SET flag_6 = %s WHERE sku = %s",
                (flag, sku),
            )
            if status == 404:
                # Report how many rows were marked as "not found".
                print(cursor.rowcount)
            # Commit per row so that a later rollback or an interrupted run
            # does not discard the flags already written.
            db.commit()
        except pymysql.MySQLError as e:
            print(e)
            # The connection object is `db`; undo only the failed update.
            db.rollback()
finally:
    # Always release the connection, even if a statement above raised.
    db.close()
分类: 开发语言
34