以成都地铁官微为例,用爬虫爬取其微博粉丝数量:
# pyautogui通过模板匹配,找到新浪微博账户显示粉丝数量的位置,
# 然后双击使得数字处于选中状态,再用pyperclip获取粘贴板数字。
import time
import pandas as pd
from tqdm import tqdm
import pyautogui
import webbrowser as wb
from selenium import webdriver
import pyperclip
# 启动程序前先打开浏览器,且使浏览器窗口最大化。
def get_city_data(city):
    """Scrape the Weibo follower count of one account via GUI automation.

    A maximized browser window must already be open before this runs, and
    'loc.png' must be a screenshot (taken on this machine/resolution) of a
    fixed UI landmark sitting just below the follower-count number.

    Args:
        city: tuple of (display name, weibo uid string).

    Returns:
        Tuple of (name, follower_count:int, timestamp 'YYYY-MM-DD HH:MM').

    Raises:
        RuntimeError: if the template image cannot be located on screen.
        ValueError: if the clipboard text cannot be parsed as an integer.
    """
    url = f'https://weibo.com/{city[1]}'
    wb.open(url=url)
    # Give the page time to render before template matching.
    pyautogui.sleep(10)
    # Template-match loc.png on screen; different machines/resolutions
    # need their own capture of loc.png.
    locate = pyautogui.locateOnScreen('loc.png')
    if locate is None:
        # Fail loudly instead of letting pyautogui.center() crash on None.
        # NOTE(review): newer pyautogui raises ImageNotFoundException
        # instead of returning None — confirm installed version.
        raise RuntimeError('template image loc.png not found on screen')
    center_x, center_y = pyautogui.center(locate)
    # Vertical offset (pixels) from the landmark up to the number itself.
    OFFSET_Y = 38
    pyautogui.moveTo(center_x, center_y - OFFSET_Y)
    pyautogui.sleep(1)
    # Double-click selects the number, then copy it to the clipboard.
    pyautogui.doubleClick()
    pyautogui.sleep(1)
    pyautogui.hotkey('ctrl', 'c')
    num = pyperclip.paste()
    # Strip thousands separators / stray whitespace before converting,
    # so values copied as e.g. '1,234,567' still parse.
    fans = int(num.replace(',', '').strip())
    return city[0], fans, time.strftime('%Y-%m-%d %H:%M', time.localtime())
def main():
    """Scrape follower counts for each configured account, print a ranked
    table, and save it as Excel and CSV."""
    city = [
        ('成都', '2384889627'),
    ]
    city_data = []
    pbar = tqdm(total=len(city), leave=True)
    for c in city:
        result = get_city_data(c)
        print(result)
        city_data.append(list(result))
        pbar.update(1)
    # Close the bar so later prints are not interleaved with it.
    pbar.close()
    col = ['城市', '粉丝数量', '统计时间']
    df = pd.DataFrame(data=city_data, columns=col)
    # Rank by follower count, descending.
    df = df.sort_values(by=col[1], ascending=False)
    # Sorting leaves the original index order; rebuild it sequentially
    # and shift it to start at 1 instead of pandas' default 0.
    df = df.reset_index(drop=True)
    df.index = df.index + 1
    print(df.head(10))
    # Modern pandas dropped both the `encoding` kwarg of to_excel
    # (removed in 2.0) and the legacy .xls/xlwt writer, so write .xlsx.
    df.to_excel('city.xlsx')
    df.to_csv('city.csv', encoding='utf-8')
if __name__ == '__main__':
    main()
输出: