使用yield优雅抓取网页分页数据
在使用Python来抓取网页数据的时候,常常碰到分页数据:其中一部分"下一页"按钮已经带有具体的链接地址,而另外一些则可能是由JavaScript来处理分页的。这就要求代码既能解析当前页面内容,又能采集下一页的URL。怎样用Python优雅地写这样的代码呢?或者说,怎样写得更pythonic?
下面分别给出部分代码实例
def get_next_page(obj):
    """Yield page HTML starting from *obj*, following etnet "下頁" links.

    *obj* is either a URL string (fetched via ``curr_session``) or
    already-fetched HTML content (parsed directly).  Each page's content
    is yielded in order; pagination recurses through the anchor whose
    text is "下頁" until no such link is found.

    On repeated fetch/parse failure the last content obtained (possibly
    ``None`` if nothing was ever fetched) is yielded and iteration stops.
    """
    content = None       # guard: stays None if every fetch attempt fails
    e_next_page = None
    error_occurred = False
    for _attempt in xrange(3):  # up to 3 attempts on network/parse errors
        try:
            if isinstance(obj, basestring):  # unicode is a basestring subclass
                resp = curr_session.get(obj, timeout=TIMEOUT, headers=headers,
                                        cookies=cookies, allow_redirects=True)
                content = resp.content
                save_html_content(obj, content)
                error_occurred = False
            else:
                content = obj
            soup = BeautifulSoup(content, features='html5lib', from_encoding="utf8")
            e_next_page = soup.find('a', text="下頁")
            break
        except Exception:  # narrowed from bare except; pause, then retry
            error_occurred = True
            time.sleep(2)
    if error_occurred:
        yield content
        return
    if e_next_page:
        next_url = "http://www.etnet.com.hk" + e_next_page.get('href')
        time.sleep(2)   # be polite between page requests
        yield content
        for i in get_next_page(next_url):
            yield i
    else:
        yield content
def get_next_page(obj, page=1):
    """Yield successive page HTML starting from *obj* (aastocks pagination).

    *obj* is either a URL string (fetched via ``curr_session``) or raw
    HTML content (parsed directly).  *page* is the 1-based page number
    of *obj*; it drives the ``&page=N`` query parameter when the
    "下一頁 " marker indicates that more pages exist.

    On repeated fetch/parse failure the last content obtained (possibly
    ``None`` if nothing was ever fetched) is yielded and iteration stops.
    """
    content = None       # guard: stays None if every fetch attempt fails
    e_next_page = None
    error_occurred = False
    for _attempt in xrange(3):  # up to 3 attempts on network/parse errors
        try:
            if isinstance(obj, basestring):  # unicode is a basestring subclass
                resp = curr_session.get(obj, timeout=TIMEOUT, headers=headers,
                                        cookies=cookies, allow_redirects=True)
                content = resp.content
                save_html_content(obj, content)
                hrefs = re.findall('industrysymbol=.*&market_id=[^;]+', content)
                # On the very first page of a URL without "sh=", rebuild a
                # canonical URL (force sh=0, page=1) and restart pagination
                # from it instead of continuing with this response.
                if page == 1 and "sh=" not in obj and hrefs:
                    reset_url = ("http://www.aastocks.com/tc/cnhk/market/industry"
                                 "/sector-industry-details.aspx?%s&page=1" %
                                 (hrefs[0].replace('sh=1', 'sh=0')
                                  .replace('&page=', '')
                                  .replace("'", '').split()[0]))
                    for next_page in get_next_page(reset_url):
                        yield next_page
                    return
                error_occurred = False
            else:
                content = obj
            soup = BeautifulSoup(content, features='html5lib', from_encoding="utf8")
            e_next_page = soup.find('td', text="下一頁 ")
            break
        except Exception:  # narrowed from bare except; log and retry
            error_occurred = True
            LOG.error(traceback.format_exc())
            time.sleep(2)
    if error_occurred:
        yield content
        return
    if e_next_page:
        hrefs = re.findall('industrysymbol=.*&market_id=[^;]+', content)
        if hrefs:
            next_url = ("http://www.aastocks.com/tc/cnhk/market/industry/sector-industry"
                        "-details.aspx?%s&page=%d" %
                        (hrefs[0].replace('sh=1', 'sh=0')
                         .replace('&page=', '').replace("'", '').split()[0],
                         page + 1))
            time.sleep(2)   # be polite between page requests
            yield content
            for next_page in get_next_page(next_url, page + 1):
                yield next_page
        else:
            # Bug fix: original silently dropped the page when the next-page
            # marker was present but no href could be reconstructed.
            yield content
    else:
        yield content
# Driver: for every collected link, wait a random polite interval, then
# feed each page the paginator yields into the page-data extractor.
for link in e_href:
    time.sleep(random.randint(MIN_INTERVAL_SECONDS_FOR_RETRIEVING,
                              MAX_INTERVAL_SECONDS_FOR_RETRIEVING))
    for page_content in get_next_page(link):
        get_page_data(page_content)