爬取网站:http://www.xici.net.co/nn
代码的功能是,爬取网页上的数据,通过正则表达式,对需要的数据进行提取,并将这些数据插入到数据库中。
在爬取的过程中遇到了两个问题,让我一直很头疼
一、
之前网站还可以正常提取,但后来可能用爬虫爬取的次数多了,网站可能进行了反爬虫修改,这也在程序中有所体现。这个问题纠结了好久。
二、问题
_mysql_exceptions.OperationalError: (1136, "Column count doesn't match value count at row 1")
将这些数据插入到数据库中也是遇到了很多问题,其中一个纠结了我好几天,在网上搜索了很多,stackoverflow,segmentfault,知乎,csdn等等,一直不能解决,最后,才发现问题所在
下面是本人写的代码
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 31 20:05:25 2015
@author: wt
"""
import requests
from bs4 import BeautifulSoup
import MySQLdb
import MySQLdb.cursors
import sys
reload(sys)
sys.setdefaultencoding('utf8')
#def getInfo(url):
proxy_info = []
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'}
page_code = requests.get('http://www.xici.net.co/nn', headers=headers).text
soup = BeautifulSoup(page_code)
table_soup = soup.find('table')
proxy_list = table_soup.findAll('tr')[1:]
conn = MySQLdb.connect(host='10.10.21.21', user='root',
passwd='123456', db='python', port = 3306, charset = 'utf8')
cur = conn.cursor()
for tr in proxy_list:
td_list = tr.findAll('td')
ip = td_list[2].string
port = td_list[3].string
location = td_list[4].string or td_list[4].find('a').string
anonymity = td_list[5].string
proxy_type = td_list[6].string
speed = td_list[7].find('div', {'class': 'bar'})['title']
connect_time = td_list[8].find('div', {'class': 'bar'})['title']
validate_time = td_list[9].string
# strip
l = [ip, port, location, anonymity, proxy_type, speed, connect_time, validate_time]
cur.execute("insert into proxy(ip, port, location, anonymity, proxy_type, speed, connect_time, validate_time) values(%s%s%s%s%s%s%s%s)", (l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7]))
print 'success connect'
conn.commit()
cur.close()
conn.close()
#!/usr/bin/python
"""Scrape proxy entries (ip:port) from http://proxy.com.ru, save them to
ip.txt, and refresh the `proxy_ip` table in MySQL."""
import urllib.request
import pymysql
from bs4 import BeautifulSoup

url = "http://proxy.com.ru"
soup = BeautifulSoup(urllib.request.urlopen(url), from_encoding='utf-8')
tables = soup.findAll('table')
i = 0
j = 0
for table in tables:
    if i == 7:  # the 8th table on the page holds the proxy list
        print('开始抓取解析ip')
        values = []
        # `with` guarantees ip.txt is closed even if parsing raises.
        with open("ip.txt", "w") as f:
            trs = table.findAll('tr')
            for tr in trs:
                if j > 0:  # j == 0 is the header row
                    tds = tr.findAll('td')
                    f.write(tds[1].text + ":" + tds[2].text + "\n")
                    values.append(tds[1].text + ":" + tds[2].text)
                j = j + 1
        # 数据库操作 (database operations)
        try:
            conn = pymysql.connect(host='localhost', user='root',
                                   passwd='123456', db='test',
                                   charset='utf8')
            cur = conn.cursor()
            # BUG FIX: execute() does not accept two ';'-separated
            # statements unless CLIENT.MULTI_STATEMENTS is enabled, and
            # building the INSERT by string concatenation breaks on
            # quote characters and is open to SQL injection.  Run the
            # delete separately and use a parameterized bulk insert.
            cur.execute('delete from proxy_ip')
            cur.executemany('insert into proxy_ip (ip) values (%s)',
                            [(v,) for v in values])
            conn.commit()
            cur.close()
            conn.close()
        except pymysql.Error as e:
            print("pyMysql Error {0}".format(e))
        break  # target table processed; stop scanning further tables
    i = i + 1
print("完成")