"""
Coursera course backup tool.

Features: download course videos, text content and subtitles.
"""

import json
import os
import random
import re
import time
import traceback
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
import yt_dlp
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager


class CourseraDownloader:
    """Back up a Coursera course through a Selenium-driven Chrome session.

    Settings are read from a JSON file. A ``requests`` session (fed with the
    browser's cookies after login) is used for plain file downloads and
    ``yt_dlp`` for videos.

    Note on naming: ``__init__`` stores the boolean config flag
    ``download_subtitles`` on the instance, which hides the method of the same
    name. Call :meth:`download_subtitle_tracks` on instances; the old method
    name is kept as a class-level alias for backward compatibility.
    """

    # One browser identity shared by the requests session, the Chrome command
    # line and the CDP override, so the three can never drift apart.
    USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    )

    def __init__(self, config_path="config.json"):
        """Load the JSON configuration and prepare the HTTP session.

        Args:
            config_path: Path of the JSON settings file. Keys read here are
                ``email``, ``password``, ``output_dir``, ``download_videos``,
                ``download_subtitles``, ``download_resources``,
                ``delay_between_requests``, ``random_delay`` and
                ``max_retries``.

        Raises:
            OSError: If the settings file cannot be opened.
            json.JSONDecodeError: If the settings file is not valid JSON.
        """
        with open(config_path, 'r', encoding='utf-8') as f:
            self.config = json.load(f)

        self.email = self.config.get('email')
        self.password = self.config.get('password')
        self.output_dir = self.config.get('output_dir', 'downloads')
        self.download_videos = self.config.get('download_videos', True)
        # NOTE: this instance attribute shadows the class-level
        # ``download_subtitles`` method alias (see the class docstring).
        self.download_subtitles = self.config.get('download_subtitles', True)
        self.download_resources = self.config.get('download_resources', True)

        # Pacing settings used to space out requests.
        self.delay_between_requests = self.config.get('delay_between_requests', 3)  # seconds
        self.random_delay = self.config.get('random_delay', True)
        self.max_retries = self.config.get('max_retries', 3)

        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': self.USER_AGENT,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            # 'br' is deliberately not advertised: requests/urllib3 only
            # decode brotli when an optional brotli package is installed, and
            # an undecoded body would be written to disk as garbage.
            'Accept-Encoding': 'gzip, deflate',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        })
        self.driver = None
        # Set by login(); lets download_course() log in only once per browser.
        self._logged_in = False

        # Make sure the output directory exists.
        Path(self.output_dir).mkdir(parents=True, exist_ok=True)

    def setup_driver(self):
        """Create the Chrome WebDriver and store it in ``self.driver``.

        Reads the ``headless`` key of the configuration (default ``False``).
        """
        headless = self.config.get('headless', False)
        options = webdriver.ChromeOptions()

        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
        options.add_experimental_option('useAutomationExtension', False)

        prefs = {
            'profile.default_content_setting_values': {
                'notifications': 2,  # disable notifications
            },
            'profile.managed_default_content_settings.images': 2,  # optional: skip images for speed
        }
        options.add_experimental_option('prefs', prefs)

        options.add_argument('--user-agent=' + self.USER_AGENT)

        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-software-rasterizer')
        options.add_argument('--disable-extensions')
        options.add_argument('--start-maximized')

        # Browser UI language.
        options.add_argument('--lang=zh-TW')

        if headless:
            options.add_argument('--headless=new')
            options.add_argument('--window-size=1920,1080')

        service = Service(ChromeDriverManager().install())
        self.driver = webdriver.Chrome(service=service, options=options)

        if not headless:
            self.driver.maximize_window()

        self.driver.execute_cdp_cmd('Network.setUserAgentOverride', {
            "userAgent": self.USER_AGENT
        })

        # NOTE(review): execute_script is invoked once, right after the
        # browser starts; confirm whether these overrides are expected to
        # persist across later page loads.
        self.driver.execute_script("""
            Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
            Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
            Object.defineProperty(navigator, 'languages', {get: () => ['zh-TW', 'zh', 'en-US', 'en']});
            window.chrome = {runtime: {}};
        """)

    def human_like_mouse_move(self, element):
        """Move the pointer near ``element`` and then onto it, with pauses.

        Best effort: any failure of the action chain is ignored so that the
        caller can still click the element.
        """
        try:
            actions = ActionChains(self.driver)
            # Land a few pixels off target first, then settle on the element.
            actions.move_to_element_with_offset(element,
                                                random.randint(-5, 5),
                                                random.randint(-5, 5))
            actions.pause(random.uniform(0.1, 0.3))
            actions.move_to_element(element)
            actions.perform()
            time.sleep(random.uniform(0.2, 0.5))
        except Exception:
            # Deliberately swallowed (cosmetic step); KeyboardInterrupt and
            # SystemExit are no longer caught, unlike the former bare except.
            pass

    def human_like_scroll(self):
        """Scroll the page down by a random 100-500 px and pause 0.5-1.5 s."""
        scroll_amount = random.randint(100, 500)
        self.driver.execute_script(f"window.scrollBy(0, {scroll_amount});")
        time.sleep(random.uniform(0.5, 1.5))

    def _type_like_human(self, element, text, pause_every):
        """Send ``text`` to ``element`` one character at a time.

        Every ``pause_every``-th character (starting with the first) is
        followed by a longer pause than the others.
        """
        for i, char in enumerate(text):
            element.send_keys(char)
            if i % pause_every == 0:
                time.sleep(random.uniform(0.1, 0.25))
            else:
                time.sleep(random.uniform(0.05, 0.15))

    def login(self):
        """Log in to Coursera with the configured e-mail and password.

        On success the browser cookies are copied into ``self.session``.

        Returns:
            True when the login form was submitted without an exception,
            False when credentials are missing or any step raised.
            NOTE(review): success is assumed after a fixed wait; the page is
            not inspected to confirm the login actually went through.
        """
        print("正在登入 Coursera...")

        if not self.email or not self.password:
            # Fail early with a clear message instead of a TypeError from
            # iterating over None further down.
            print("登入失敗: 設定檔缺少 email 或 password")
            return False

        self.driver.get("https://www.coursera.org/")
        self.safe_delay(3)

        self.human_like_scroll()

        try:
            # Open the login form.
            login_button = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.LINK_TEXT, "Log In"))
            )
            self.human_like_mouse_move(login_button)
            login_button.click()
            self.safe_delay(2)

            # Wait for the form fields.
            email_input = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "email"))
            )
            password_input = self.driver.find_element(By.ID, "password")

            self.human_like_mouse_move(email_input)
            email_input.click()
            self.safe_delay(0.3)
            self._type_like_human(email_input, self.email, 3)

            self.safe_delay(0.5)

            self.human_like_mouse_move(password_input)
            password_input.click()
            self.safe_delay(0.3)
            self._type_like_human(password_input, self.password, 4)

            self.safe_delay(1)

            # Submit the form.
            submit_button = self.driver.find_element(By.CSS_SELECTOR, "button[type='submit']")
            self.human_like_mouse_move(submit_button)
            submit_button.click()

            # Give the site time to finish the login.
            self.safe_delay(5)
            print("登入成功!")

            # Mirror the browser cookies into the requests session.
            for cookie in self.driver.get_cookies():
                self.session.cookies.set(cookie['name'], cookie['value'])

            self._logged_in = True
            return True
        except Exception as e:
            print(f"登入失敗: {e}")
            return False

    def safe_delay(self, base_delay=None):
        """Sleep for ``base_delay`` seconds, optionally randomised.

        Args:
            base_delay: Seconds to wait; ``None`` means
                ``self.delay_between_requests``. When ``self.random_delay``
                is true the wait is scaled by a factor in [0.5, 1.5).
        """
        if base_delay is None:
            base_delay = self.delay_between_requests

        if self.random_delay:
            delay = base_delay * (0.5 + random.random())
        else:
            delay = base_delay

        time.sleep(delay)

    def sanitize_filename(self, filename):
        """Return ``filename`` made safe for use as a single path component.

        The characters ``<>:"/\\|?*`` and ASCII control characters become
        ``_``, surrounding whitespace is removed and the result is capped at
        200 characters. An empty result falls back to ``'untitled'`` so the
        caller never ends up with an empty path component.
        """
        cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', filename).strip()
        # Strip again: truncation can expose trailing whitespace.
        cleaned = cleaned[:200].strip()
        return cleaned or 'untitled'

    def download_file(self, url, filepath):
        """Stream ``url`` to ``filepath`` with a progress bar.

        Waits ``safe_delay()`` first. The response is closed in every case,
        and a partially written file is removed when the transfer fails.

        Returns:
            True on success, False if any step raised.
        """
        self.safe_delay()

        started_writing = False
        try:
            # Send a Referer as if the download was started from the site.
            headers = self.session.headers.copy()
            headers['Referer'] = 'https://www.coursera.org/'

            with self.session.get(url, stream=True, timeout=30, headers=headers) as response:
                response.raise_for_status()
                total_size = int(response.headers.get('content-length', 0))

                started_writing = True
                with open(filepath, 'wb') as f, tqdm(
                    desc=os.path.basename(filepath),
                    total=total_size,
                    unit='iB',
                    unit_scale=True,
                    unit_divisor=1024,
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        pbar.update(f.write(chunk))

            return True
        except Exception as e:
            print(f"下載失敗 {url}: {e}")
            if started_writing:
                # Do not leave a truncated file that looks like a finished one.
                try:
                    os.remove(filepath)
                except OSError:
                    pass
            return False

    def download_video_with_ytdlp(self, video_url, output_path):
        """Download ``video_url`` to ``output_path`` with yt-dlp.

        The browser cookies are written to a temporary Netscape-format file
        in ``self.output_dir`` for yt-dlp; that file is always removed again,
        including when the download fails, because it holds session cookies.

        Returns:
            True on success, False if any step raised.
        """
        cookie_file = os.path.join(self.output_dir, 'cookies.txt')
        try:
            # Later cookies with the same name replace earlier ones.
            cookies = {c['name']: c['value'] for c in self.driver.get_cookies()}

            with open(cookie_file, 'w', encoding='utf-8') as f:
                f.write('# Netscape HTTP Cookie File\n')
                for name, value in cookies.items():
                    f.write(f'.coursera.org\tTRUE\t/\tTRUE\t0\t{name}\t{value}\n')

            ydl_opts = {
                'outtmpl': output_path,
                'format': 'best',
                'cookiefile': cookie_file,
                'quiet': False,
                'no_warnings': False,
            }

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([video_url])

            return True
        except Exception as e:
            print(f"影片下載失敗: {e}")
            return False
        finally:
            try:
                if os.path.exists(cookie_file):
                    os.remove(cookie_file)
            except OSError:
                pass

    def download_subtitle_tracks(self, video_element, output_dir):
        """Download every ``<track>`` of ``video_element`` as a ``.vtt`` file.

        Files are named after the track's ``label`` attribute (``subtitle``
        when absent). The result of each individual download is not checked.

        Returns:
            True when the tracks were enumerated without an exception,
            False otherwise.
        """
        try:
            subtitle_tracks = video_element.find_elements(By.TAG_NAME, "track")

            for track in subtitle_tracks:
                src = track.get_attribute('src')
                label = track.get_attribute('label') or 'subtitle'

                if src:
                    subtitle_filename = f"{self.sanitize_filename(label)}.vtt"
                    subtitle_path = os.path.join(output_dir, subtitle_filename)

                    print(f"下載字幕: {label}")
                    self.download_file(src, subtitle_path)

            return True
        except Exception as e:
            print(f"字幕下載失敗: {e}")
            return False

    # Backward-compatible name. On instances it is hidden by the boolean
    # ``download_subtitles`` flag set in __init__, so it is only reachable
    # through the class; use download_subtitle_tracks() on instances.
    download_subtitles = download_subtitle_tracks

    def download_course(self, course_url):
        """Open ``course_url``, create its output folder and write metadata.

        Starts the browser and logs in if that has not happened yet, writes
        ``course_info.json`` into the course folder, then keeps the browser
        open for 60 seconds for manual inspection.

        Returns:
            False when login fails, True otherwise.
        """
        print(f"\n開始備份課程: {course_url}")

        if not self.driver:
            self.setup_driver()

        # Log in only once per browser session: after the first login the
        # "Log In" link is gone, so a repeated login() would fail.
        if not getattr(self, '_logged_in', False):
            if not self.login():
                print("無法登入,停止下載")
                return False

        self.driver.get(course_url)
        self.safe_delay(3)

        self.human_like_scroll()
        self.safe_delay(2)

        # Derive the folder name from the page's first <h1>.
        course_title = None
        try:
            course_title = self.driver.find_element(By.CSS_SELECTOR, "h1").text
            course_dir = os.path.join(self.output_dir, self.sanitize_filename(course_title))
            Path(course_dir).mkdir(parents=True, exist_ok=True)
            print(f"課程名稱: {course_title}")
        except Exception:
            course_dir = os.path.join(self.output_dir, "unknown_course")
            Path(course_dir).mkdir(parents=True, exist_ok=True)

        course_info = {
            'title': course_title if course_title is not None else 'Unknown',
            'url': course_url,
            'download_date': time.strftime('%Y-%m-%d %H:%M:%S')
        }

        with open(os.path.join(course_dir, 'course_info.json'), 'w', encoding='utf-8') as f:
            json.dump(course_info, f, ensure_ascii=False, indent=2)

        print(f"\n課程內容將儲存至: {course_dir}")
        print("\n注意:由於 Coursera 網站結構複雜,您可能需要:")
        print("1. 手動導航到課程的「週」或「模組」頁面")
        print("2. 使用瀏覽器開發者工具找到影片和資源的實際 URL")
        print("3. 根據您的具體課程結構調整程式碼")
        print("\n程式將保持瀏覽器開啟 60 秒,請手動檢查頁面結構...")

        time.sleep(60)

        return True

    def close(self):
        """Quit the browser, if one is running, and forget the session."""
        if self.driver:
            try:
                self.driver.quit()
            finally:
                # Drop the dead handle so a later setup_driver() starts clean.
                self.driver = None
                self._logged_in = False


def main(config_path="config.json"):
    """Run the backup for every course URL listed in the configuration.

    Args:
        config_path: Path of the JSON settings file. Its ``course_urls`` key
            lists the courses to back up.

    Prints an error and returns when the settings file does not exist. Any
    other exception is reported with a traceback, and the browser is closed
    in every case once a downloader has been created.
    """
    print("=" * 60)
    print("Coursera 課程備份工具")
    print("=" * 60)

    # Refuse to start without a settings file.
    if not os.path.exists(config_path):
        print(f"\n錯誤:找不到 {config_path} 檔案")
        print("請參考 config.example.json 建立您的設定檔")
        return

    # Sentinel so the finally clause knows whether there is anything to close.
    downloader = None
    try:
        downloader = CourseraDownloader(config_path)

        course_urls = downloader.config.get('course_urls')
        if course_urls:
            for course_url in course_urls:
                downloader.download_course(course_url)
        else:
            print(f"\n請在 {config_path} 中設定要下載的課程 URL")

    except KeyboardInterrupt:
        print("\n\n使用者中斷下載")
    except Exception as e:
        print(f"\n發生錯誤: {e}")
        traceback.print_exc()
    finally:
        if downloader is not None:
            downloader.close()
        print("\n程式結束")


# Run the backup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()