在 GitHub 上找到了一个 Twitter 爬虫,试了一下,修改了其中一个与编码有关的问题,现在可以抓取一定数量的推文。原始代码见:
https://gist.github.com/TVFlash/cccc2808cdd9a04db1ce
代码如下:
from bs4 import BeautifulSoup, NavigableString
from urllib2 import urlopen
#Note: must be a public profile
print "Twitter username:"
user = raw_input()
endpoint = "https://twitter.com/%s"
f = urlopen(endpoint % user)
html = f.read()
f.close()
soup = BeautifulSoup(html, 'html.parser')
tweets = soup.find_all('strong', {'class': 'fullname js-action-profile-name show-popup-with-id'})
# Walk the matched name elements by index so the same position can be used to
# look up the corresponding username <span>.
# NOTE(review): the statements below are not indented under the `for`, so this
# does not parse as written; the indentation was presumably lost when the code
# was pasted. Restore it before running.
for i in range(0,len(tweets)):
# Rebinds `user` (previously the handle read from stdin) to the first child of
# the i-th matched <strong> element.
user = tweets[i].contents[0]
# Calling the soup object is BeautifulSoup shorthand for find_all(). The query
# does not depend on `i`, so the same search is repeated on every iteration.
# Assumes the <span> matches line up one-to-one with `tweets` -- TODO confirm.
action_tag = soup('span', {'class': 'username js-action-profile-name'})
# NOTE(review): `.content` is not the `.contents` child list used above.
# BeautifulSoup resolves unknown attributes as a lookup for a child tag of that
# name, so this most likely evaluates to None. It looks like a typo or a
# truncated `.contents[...]` expression -- verify against the original gist.
show_name = action_tag[i].content