import urllib
import urllib2
import HTMLParser
from bs4 import BeautifulSoup
import re
import MySQLdb as mdb
import json
i=1 # running 1-based index of the companies visited so far (shared by both crawler functions below)
def GetOnePageUrl(url):
global i
flag = 0
request = urllib2.Request(url)
html = urllib2.urlopen(request)
soup = BeautifulSoup(html, "lxml")
for link in soup.find_all(name='a', attrs={"href": re.compile(r'^http://qy.58.com/mq/[0-9]*/$')}):
#print link.get('href')
if flag%2 == 0:
GetOneUrlInfo(link.get('href'))
print i
i += 1
flag += 1
def GetOneUrlInfo(url):
global i
request = urllib2.Request(url)
html = urllib2.urlopen(request)
soup = BeautifulSoup(html,"lxml")
#for addr in soup.find_all(name='td',limit=5):
# print addr.string
fiveinfo = soup.find_all(name='td',limit=5)
if len(fiveinfo) == 0: #the company'
# [My first crawler] Python scraper for 58.com company listings, inserting results into a MySQL database
# (Source article metadata: latest recommended revision published 2024-05-27 16:52:41)