#!/usr/bin/python
# coding: UTF-8
import urllib, urllib2, cookielib, re
# Scrape a fixed range of article pages and record pattern matches in 1.csv.
#
# For every article id in [280020, 280840) the page is fetched through a
# cookie-aware opener that sends a browser-like User-Agent header.  Every
# occurrence of MATCH_PATTERN in the page is written to 1.csv as one
# "<url>,<match>" row, and one tab-separated progress line is printed per URL.
urlPrefix = 'http://www.somehost.com'

# Four groups of four word characters, separated by single whitespace chars.
# NOTE(review): the original literal used "/" where "\" was evidently meant
# ("/w{4}/s..." and "/tOK/t"); the escapes are restored here -- confirm.
MATCH_PATTERN = re.compile(r"\w{4}\s\w{4}\s\w{4}\s\w{4}")

cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.addheaders = [('User-Agent', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;)')]

# Running total of rows written to the CSV across all pages.
# NOTE(review): the original indentation was lost; the counter is assumed to
# advance once per match (not once per page) -- confirm.
count = 0

# "with" guarantees the file is closed even if an unexpected error escapes.
with open('1.csv', 'w') as f:
    for i in range(280020, 280840):
        url = urlPrefix + "/articles/" + str(i) + ".htm"
        # Keep the try body to the only call expected to raise HTTPError.
        try:
            indexPageContent = opener.open(url).read()
        except urllib2.HTTPError as e:
            # Report the real status code rather than assuming 404.
            print(url + "\t" + str(e.code) + "\t" + str(count))
            continue
        for match in MATCH_PATTERN.finditer(indexPageContent):
            # One row per match; the newline keeps rows from running together.
            f.write(url + "," + match.group() + "\n")
            count += 1
        print(url + "\tOK\t" + str(count))