1. 抓取影评
一般影评网站都有反爬虫机制,而且每个网站的机制都不尽相同,所以需要根据具体网站的情况采取相应的手段来进行爬取。本文以猫眼电影(maoyan)为例:
df = pd.DataFrame(columns=['date', 'score', 'city', 'comment', 'nick'])
for i in range(1000):
j = random.randint(1, 1000)
print(str(i) + 'th '+'download page: ' + str(j))
try:
time.sleep(2)
url = 'http://m.maoyan.com/mmdb/comments/movie/1216446.json?_v_=yes&offset=' + str(j)
html = requests.get(url=url).content
data = json.loads(html.decode('utf-8'))['cmts']
for item in data:
df = df.append({
'date': item['time'].split(' ')[0],
'city': item['cityName'],
'score': item['score'],
'comment': item['content'],
'nick': item['nick']}, ignore_index=True)
with open('./filmreview/take-my-bro3.csv', 'a+', encoding='utf-8') as fhand:
df.to