""" Coursera 課程備份工具 功能:下載課程影片、文字內容和字幕 """ import os import json import time import re import random from pathlib import Path from urllib.parse import urljoin, urlparse import requests from bs4 import BeautifulSoup from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.action_chains import ActionChains from webdriver_manager.chrome import ChromeDriverManager from tqdm import tqdm import yt_dlp class CourseraDownloader: def __init__(self, config_path="config.json"): """初始化下載器""" with open(config_path, 'r', encoding='utf-8') as f: self.config = json.load(f) self.email = self.config.get('email') self.password = self.config.get('password') self.output_dir = self.config.get('output_dir', 'downloads') self.download_videos = self.config.get('download_videos', True) self.download_subtitles = self.config.get('download_subtitles', True) self.download_resources = self.config.get('download_resources', True) # 安全設定:降低被偵測為機器人的風險 self.delay_between_requests = self.config.get('delay_between_requests', 3) # 預設3秒 self.random_delay = self.config.get('random_delay', True) # 隨機延遲 self.max_retries = self.config.get('max_retries', 3) # 最大重試次數 self.session = requests.Session() # 設定 requests session 的 headers,模擬真實瀏覽器 self.session.headers.update({ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7', 'Accept-Encoding': 'gzip, deflate, br', 'DNT': '1', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1' }) self.driver = None # 建立輸出目錄 Path(self.output_dir).mkdir(parents=True, exist_ok=True) def setup_driver(self): """設定 Selenium WebDriver""" options = webdriver.ChromeOptions() # 反機器人偵測設定 options.add_argument('--disable-blink-features=AutomationControlled') options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"]) options.add_experimental_option('useAutomationExtension', False) # 添加更多瀏覽器偽裝參數 prefs = { 'profile.default_content_setting_values': { 'notifications': 2, # 禁用通知 }, 'profile.managed_default_content_settings.images': 2, # 可選:禁用圖片加快速度 } options.add_experimental_option('prefs', prefs) # 使用真實的瀏覽器 User-Agent options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36') # 其他安全設定 options.add_argument('--disable-dev-shm-usage') options.add_argument('--no-sandbox') options.add_argument('--disable-gpu') options.add_argument('--disable-software-rasterizer') options.add_argument('--disable-extensions') options.add_argument('--start-maximized') # 語言設定 options.add_argument('--lang=zh-TW') if self.config.get('headless', False): options.add_argument('--headless=new') # 使用新版 headless 模式 options.add_argument('--window-size=1920,1080') service = Service(ChromeDriverManager().install()) self.driver = webdriver.Chrome(service=service, options=options) if not self.config.get('headless', False): self.driver.maximize_window() # 執行反偵測腳本 self.driver.execute_cdp_cmd('Network.setUserAgentOverride', { "userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' }) # 隱藏多個 webdriver 特徵 self.driver.execute_script(""" Object.defineProperty(navigator, 'webdriver', {get: () => undefined}); Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]}); Object.defineProperty(navigator, 'languages', {get: () => ['zh-TW', 'zh', 'en-US', 'en']}); window.chrome = {runtime: {}}; """) def human_like_mouse_move(self, element): """模擬人類的滑鼠移動到元素""" try: actions = ActionChains(self.driver) # 隨機移動到元素附近,然後移動到元素 actions.move_to_element_with_offset(element, random.randint(-5, 5), random.randint(-5, 5)) actions.pause(random.uniform(0.1, 0.3)) actions.move_to_element(element) actions.perform() time.sleep(random.uniform(0.2, 0.5)) except: pass def human_like_scroll(self): """模擬人類的滾動行為""" scroll_amount = random.randint(100, 500) self.driver.execute_script(f"window.scrollBy(0, {scroll_amount});") time.sleep(random.uniform(0.5, 1.5)) def login(self): """登入 Coursera""" print("正在登入 Coursera...") self.driver.get("https://www.coursera.org/") self.safe_delay(3) # 安全延遲 # 模擬人類瀏覽行為 self.human_like_scroll() try: # 點擊登入按鈕 login_button = WebDriverWait(self.driver, 10).until( EC.element_to_be_clickable((By.LINK_TEXT, "Log In")) ) # 模擬滑鼠移動到登入按鈕 self.human_like_mouse_move(login_button) login_button.click() self.safe_delay(2) # 安全延遲 # 等待登入表單載入 email_input = WebDriverWait(self.driver, 10).until( EC.presence_of_element_located((By.ID, "email")) ) password_input = self.driver.find_element(By.ID, "password") # 模擬點擊輸入框 self.human_like_mouse_move(email_input) email_input.click() self.safe_delay(0.3) # 逐字輸入,模擬真人打字(包含偶爾的停頓) for i, char in enumerate(self.email): email_input.send_keys(char) # 模擬打字節奏變化 if i % 3 == 0: time.sleep(random.uniform(0.1, 0.25)) # 偶爾停頓 else: time.sleep(random.uniform(0.05, 0.15)) self.safe_delay(0.5) # 移動到密碼輸入框 self.human_like_mouse_move(password_input) password_input.click() self.safe_delay(0.3) for i, char in enumerate(self.password): password_input.send_keys(char) if i % 4 == 0: time.sleep(random.uniform(0.1, 0.25)) else: time.sleep(random.uniform(0.05, 0.15)) self.safe_delay(1) # 點擊登入按鈕 submit_button = self.driver.find_element(By.CSS_SELECTOR, "button[type='submit']") self.human_like_mouse_move(submit_button) submit_button.click() # 等待登入完成 self.safe_delay(5) print("登入成功!") # 取得 cookies 並同步到 requests session cookies = self.driver.get_cookies() for cookie in cookies: self.session.cookies.set(cookie['name'], cookie['value']) return True except Exception as e: print(f"登入失敗: {e}") return False def safe_delay(self, base_delay=None): """安全延遲:模擬人類操作,避免被偵測""" if base_delay is None: base_delay = self.delay_between_requests if self.random_delay: # 加入隨機延遲(±50%) delay = base_delay * (0.5 + random.random()) else: delay = base_delay time.sleep(delay) def sanitize_filename(self, filename): """清理檔案名稱,移除不合法字元""" filename = re.sub(r'[<>:"/\\|?*]', '_', filename) filename = filename.strip() return filename[:200] # 限制檔名長度 def download_file(self, url, filepath): """下載檔案""" # 每次下載前加入隨機延遲 self.safe_delay() try: # 添加 Referer header,模擬從網頁點擊下載 headers = self.session.headers.copy() headers['Referer'] = 'https://www.coursera.org/' response = self.session.get(url, stream=True, timeout=30, headers=headers) response.raise_for_status() total_size = int(response.headers.get('content-length', 0)) with open(filepath, 'wb') as f, tqdm( desc=os.path.basename(filepath), total=total_size, unit='iB', unit_scale=True, unit_divisor=1024, ) as pbar: for chunk in response.iter_content(chunk_size=8192): size = f.write(chunk) pbar.update(size) return True except Exception as e: print(f"下載失敗 {url}: {e}") return False def download_video_with_ytdlp(self, video_url, output_path): """使用 yt-dlp 下載影片""" try: # 從瀏覽器取得 cookies cookies = {} for cookie in self.driver.get_cookies(): cookies[cookie['name']] = cookie['value'] # 建立臨時 cookies 檔案 cookie_file = os.path.join(self.output_dir, 'cookies.txt') with open(cookie_file, 'w') as f: f.write('# Netscape HTTP Cookie File\n') for name, value in cookies.items(): f.write(f'.coursera.org\tTRUE\t/\tTRUE\t0\t{name}\t{value}\n') ydl_opts = { 'outtmpl': output_path, 'format': 'best', 'cookiefile': cookie_file, 'quiet': False, 'no_warnings': False, } with yt_dlp.YoutubeDL(ydl_opts) as ydl: ydl.download([video_url]) # 刪除臨時 cookies 檔案 if os.path.exists(cookie_file): os.remove(cookie_file) return True except Exception as e: print(f"影片下載失敗: {e}") return False def download_subtitles(self, video_element, output_dir): """下載字幕檔案""" try: # 尋找字幕連結 subtitle_tracks = video_element.find_elements(By.TAG_NAME, "track") for track in subtitle_tracks: src = track.get_attribute('src') label = track.get_attribute('label') or 'subtitle' if src: subtitle_filename = f"{self.sanitize_filename(label)}.vtt" subtitle_path = os.path.join(output_dir, subtitle_filename) print(f"下載字幕: {label}") self.download_file(src, subtitle_path) return True except Exception as e: print(f"字幕下載失敗: {e}") return False def download_course(self, course_url): """下載整個課程""" print(f"\n開始備份課程: {course_url}") # 設定 WebDriver if not self.driver: self.setup_driver() # 登入 if not self.login(): print("無法登入,停止下載") return False # 前往課程頁面 self.driver.get(course_url) self.safe_delay(3) # 模擬人類瀏覽行為 self.human_like_scroll() self.safe_delay(2) # 取得課程名稱 try: course_title = self.driver.find_element(By.CSS_SELECTOR, "h1").text course_dir = os.path.join(self.output_dir, self.sanitize_filename(course_title)) Path(course_dir).mkdir(parents=True, exist_ok=True) print(f"課程名稱: {course_title}") except: course_dir = os.path.join(self.output_dir, "unknown_course") Path(course_dir).mkdir(parents=True, exist_ok=True) # 儲存課程資訊 course_info = { 'title': course_title if 'course_title' in locals() else 'Unknown', 'url': course_url, 'download_date': time.strftime('%Y-%m-%d %H:%M:%S') } with open(os.path.join(course_dir, 'course_info.json'), 'w', encoding='utf-8') as f: json.dump(course_info, f, ensure_ascii=False, indent=2) print(f"\n課程內容將儲存至: {course_dir}") print("\n注意:由於 Coursera 網站結構複雜,您可能需要:") print("1. 手動導航到課程的「週」或「模組」頁面") print("2. 使用瀏覽器開發者工具找到影片和資源的實際 URL") print("3. 根據您的具體課程結構調整程式碼") print("\n程式將保持瀏覽器開啟 60 秒,請手動檢查頁面結構...") time.sleep(60) return True def close(self): """關閉瀏覽器""" if self.driver: self.driver.quit() def main(): """主程式""" print("=" * 60) print("Coursera 課程備份工具") print("=" * 60) # 檢查配置檔案 if not os.path.exists('config.json'): print("\n錯誤:找不到 config.json 檔案") print("請參考 config.example.json 建立您的設定檔") return try: downloader = CourseraDownloader() # 取得課程 URL if 'course_urls' in downloader.config and downloader.config['course_urls']: for course_url in downloader.config['course_urls']: downloader.download_course(course_url) else: print("\n請在 config.json 中設定要下載的課程 URL") except KeyboardInterrupt: print("\n\n使用者中斷下載") except Exception as e: print(f"\n發生錯誤: {e}") import traceback traceback.print_exc() finally: if 'downloader' in locals(): downloader.close() print("\n程式結束") if __name__ == "__main__": main()