Update

2025-12-16 15:07:28 +08:00
parent 788f6db90a
commit e25629ed6d
5 changed files with 725 additions and 1 deletions
--- a/coursera_downloader.py
+++ b/coursera_downloader.py
@@ -0,0 +1,411 @@
+"""
+Coursera course backup tool.
+Features: download course videos, text content, and subtitles.
+"""
+
+import os
+import json
+import time
+import re
+import random
+from pathlib import Path
+from urllib.parse import urljoin, urlparse
+import requests
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.action_chains import ActionChains
+from webdriver_manager.chrome import ChromeDriverManager
+from tqdm import tqdm
+import yt_dlp
+
+
+class CourseraDownloader:
+    """Back up a Coursera course: videos, text content and subtitles.
+
+    Settings come from a JSON config file. A Selenium-driven Chrome session
+    performs the login; its cookies are then copied into a ``requests``
+    session that is used for plain file downloads.
+    """
+
+    # Single source of truth for the browser identity presented by both the
+    # requests session and the Selenium-driven Chrome instance.
+    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+
+    # Characters that are invalid in file names on common platforms, plus
+    # ASCII control characters (newlines, tabs, ...).
+    _INVALID_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')
+
+    def __init__(self, config_path="config.json"):
+        """Load the configuration and prepare the HTTP session.
+
+        Args:
+            config_path: Path to the JSON configuration file.
+
+        Raises:
+            OSError: If the configuration file cannot be opened.
+            json.JSONDecodeError: If the file is not valid JSON.
+        """
+        with open(config_path, 'r', encoding='utf-8') as f:
+            self.config = json.load(f)
+
+        self.email = self.config.get('email')
+        self.password = self.config.get('password')
+        self.output_dir = self.config.get('output_dir', 'downloads')
+        self.download_videos = self.config.get('download_videos', True)
+        # Deliberately NOT stored as ``self.download_subtitles``: an instance
+        # attribute with that name would shadow the download_subtitles()
+        # method and make it uncallable ("'bool' object is not callable").
+        self.subtitles_enabled = self.config.get('download_subtitles', True)
+        self.download_resources = self.config.get('download_resources', True)
+
+        # Pacing settings that keep the request rate low.
+        self.delay_between_requests = self.config.get('delay_between_requests', 3)  # seconds
+        self.random_delay = self.config.get('random_delay', True)  # jitter the delay
+        self.max_retries = self.config.get('max_retries', 3)  # maximum retry count
+
+        self.session = requests.Session()
+        # Give the requests session the same headers a regular browser sends.
+        self.session.headers.update({
+            'User-Agent': self._USER_AGENT,
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'DNT': '1',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1'
+        })
+        self.driver = None
+        # Set by login(); lets download_course() skip the login flow for the
+        # second and later courses processed by the same instance.
+        self._logged_in = False
+
+        # Create the output directory.
+        Path(self.output_dir).mkdir(parents=True, exist_ok=True)
+
+    def setup_driver(self):
+        """Create the Chrome WebDriver and store it in ``self.driver``.
+
+        Honours the ``headless`` config key. The chromedriver binary is
+        resolved through ``ChromeDriverManager().install()``.
+        """
+        headless = self.config.get('headless', False)
+        options = webdriver.ChromeOptions()
+
+        # Automation-related switches.
+        options.add_argument('--disable-blink-features=AutomationControlled')
+        options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
+        options.add_experimental_option('useAutomationExtension', False)
+
+        # Browser profile preferences.
+        prefs = {
+            'profile.default_content_setting_values': {
+                'notifications': 2,  # disable notifications
+            },
+            'profile.managed_default_content_settings.images': 2,  # optional: block images to load pages faster
+        }
+        options.add_experimental_option('prefs', prefs)
+
+        options.add_argument(f'--user-agent={self._USER_AGENT}')
+
+        # Miscellaneous Chrome flags.
+        options.add_argument('--disable-dev-shm-usage')
+        options.add_argument('--no-sandbox')
+        options.add_argument('--disable-gpu')
+        options.add_argument('--disable-software-rasterizer')
+        options.add_argument('--disable-extensions')
+        options.add_argument('--start-maximized')
+
+        # Language setting.
+        options.add_argument('--lang=zh-TW')
+
+        if headless:
+            options.add_argument('--headless=new')  # new-style headless mode
+            options.add_argument('--window-size=1920,1080')
+
+        service = Service(ChromeDriverManager().install())
+        self.driver = webdriver.Chrome(service=service, options=options)
+
+        if not headless:
+            self.driver.maximize_window()
+
+        self.driver.execute_cdp_cmd('Network.setUserAgentOverride', {
+            "userAgent": self._USER_AGENT
+        })
+
+        # NOTE(review): execute_script runs in the currently loaded document
+        # only; confirm whether these overrides are expected to persist
+        # across later navigations.
+        self.driver.execute_script("""
+            Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
+            Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
+            Object.defineProperty(navigator, 'languages', {get: () => ['zh-TW', 'zh', 'en-US', 'en']});
+            window.chrome = {runtime: {}};
+        """)
+
+    def human_like_mouse_move(self, element):
+        """Move the mouse pointer onto ``element`` in two small steps.
+
+        Best effort: failures are ignored because the move is cosmetic and
+        the caller clicks the element regardless.
+        """
+        try:
+            actions = ActionChains(self.driver)
+            # Land slightly off-target first, then settle on the element.
+            actions.move_to_element_with_offset(element,
+                                                random.randint(-5, 5),
+                                                random.randint(-5, 5))
+            actions.pause(random.uniform(0.1, 0.3))
+            actions.move_to_element(element)
+            actions.perform()
+            time.sleep(random.uniform(0.2, 0.5))
+        except Exception:
+            # ``Exception`` rather than a bare ``except`` so that Ctrl+C
+            # (KeyboardInterrupt) still interrupts the program.
+            pass
+
+    def human_like_scroll(self):
+        """Scroll the page down by a random 100-500 px and pause briefly."""
+        scroll_amount = random.randint(100, 500)
+        self.driver.execute_script(f"window.scrollBy(0, {scroll_amount});")
+        time.sleep(random.uniform(0.5, 1.5))
+
+    def _type_like_human(self, element, text, pause_every):
+        """Type ``text`` into ``element`` one character at a time.
+
+        Every ``pause_every``-th character (starting with the first) is
+        followed by a longer pause; all others by a shorter one.
+        """
+        for index, char in enumerate(text):
+            element.send_keys(char)
+            if index % pause_every == 0:
+                time.sleep(random.uniform(0.1, 0.25))
+            else:
+                time.sleep(random.uniform(0.05, 0.15))
+
+    def login(self):
+        """Log in to Coursera through the browser.
+
+        On success the browser cookies are copied into ``self.session``.
+
+        Returns:
+            True if the login flow ran without raising, False otherwise.
+        """
+        print("正在登入 Coursera...")
+        if not self.email or not self.password:
+            # Fail fast instead of driving the browser and then crashing
+            # while iterating over a missing credential.
+            print("登入失敗: config.json 缺少 email 或 password")
+            return False
+
+        self.driver.get("https://www.coursera.org/")
+        self.safe_delay(3)
+
+        self.human_like_scroll()
+
+        try:
+            # Open the login form.
+            login_button = WebDriverWait(self.driver, 10).until(
+                EC.element_to_be_clickable((By.LINK_TEXT, "Log In"))
+            )
+            self.human_like_mouse_move(login_button)
+            login_button.click()
+            self.safe_delay(2)
+
+            # Wait for the login form to load.
+            email_input = WebDriverWait(self.driver, 10).until(
+                EC.presence_of_element_located((By.ID, "email"))
+            )
+            password_input = self.driver.find_element(By.ID, "password")
+
+            # Fill in the e-mail address.
+            self.human_like_mouse_move(email_input)
+            email_input.click()
+            self.safe_delay(0.3)
+            self._type_like_human(email_input, self.email, pause_every=3)
+
+            self.safe_delay(0.5)
+
+            # Fill in the password.
+            self.human_like_mouse_move(password_input)
+            password_input.click()
+            self.safe_delay(0.3)
+            self._type_like_human(password_input, self.password, pause_every=4)
+
+            self.safe_delay(1)
+
+            # Submit the form.
+            submit_button = self.driver.find_element(By.CSS_SELECTOR, "button[type='submit']")
+            self.human_like_mouse_move(submit_button)
+            submit_button.click()
+
+            # NOTE(review): success is assumed after a fixed wait; nothing
+            # here verifies that the site actually accepted the credentials.
+            self.safe_delay(5)
+            print("登入成功！")
+
+            # Copy the browser cookies into the requests session, keeping
+            # each cookie's domain and path so session cookies are not sent
+            # to unrelated hosts (e.g. third-party CDNs serving subtitles).
+            for cookie in self.driver.get_cookies():
+                self.session.cookies.set(
+                    cookie['name'],
+                    cookie['value'],
+                    domain=cookie.get('domain') or '',
+                    path=cookie.get('path') or '/',
+                )
+
+            self._logged_in = True
+            return True
+        except Exception as e:
+            print(f"登入失敗: {e}")
+            return False
+
+    def safe_delay(self, base_delay=None):
+        """Sleep for roughly ``base_delay`` seconds.
+
+        Args:
+            base_delay: Delay in seconds; defaults to
+                ``self.delay_between_requests``. When ``self.random_delay``
+                is true the actual delay is drawn from 50%-150% of it.
+        """
+        if base_delay is None:
+            base_delay = self.delay_between_requests
+
+        if self.random_delay:
+            # Random jitter of +/-50%.
+            delay = base_delay * (0.5 + random.random())
+        else:
+            delay = base_delay
+
+        # time.sleep() raises ValueError for negative values.
+        time.sleep(max(0, delay))
+
+    def sanitize_filename(self, filename):
+        """Return ``filename`` with characters that are unsafe in paths replaced.
+
+        Invalid and control characters become ``_``; the result is limited
+        to 200 characters and never ends with a space or dot. May return an
+        empty string, so callers should supply their own fallback name.
+        """
+        cleaned = self._INVALID_FILENAME_CHARS.sub('_', filename.strip())
+        # Trim AFTER truncating: cutting at 200 characters can expose a
+        # trailing space or dot, which Windows does not allow.
+        return cleaned[:200].rstrip(' .')
+
+    def download_file(self, url, filepath):
+        """Download ``url`` to ``filepath`` with a progress bar.
+
+        The data is written to ``<filepath>.part`` and renamed on success,
+        so a failed transfer neither leaves a truncated file behind nor
+        clobbers an earlier complete download.
+
+        Returns:
+            True on success, False if any step failed.
+        """
+        # Pace every download.
+        self.safe_delay()
+
+        part_path = f"{filepath}.part"
+        try:
+            # Per-request headers are merged over the session headers, so
+            # only the extra Referer needs to be supplied here.
+            headers = {'Referer': 'https://www.coursera.org/'}
+
+            # The context manager releases the streamed connection even when
+            # the transfer is interrupted.
+            with self.session.get(url, stream=True, timeout=30, headers=headers) as response:
+                response.raise_for_status()
+
+                total_size = int(response.headers.get('content-length', 0))
+
+                with open(part_path, 'wb') as f, tqdm(
+                    desc=os.path.basename(filepath),
+                    total=total_size,
+                    unit='iB',
+                    unit_scale=True,
+                    unit_divisor=1024,
+                ) as pbar:
+                    for chunk in response.iter_content(chunk_size=8192):
+                        pbar.update(f.write(chunk))
+
+            os.replace(part_path, filepath)
+            return True
+        except Exception as e:
+            print(f"下載失敗 {url}: {e}")
+            try:
+                os.remove(part_path)
+            except OSError:
+                pass  # nothing was written, or it is already gone
+            return False
+
+    def download_video_with_ytdlp(self, video_url, output_path):
+        """Download a video with yt-dlp, authenticated via the browser cookies.
+
+        The cookies are written to a temporary Netscape-format file that is
+        always removed afterwards, including when the download fails.
+
+        Returns:
+            True on success, False if any step failed.
+        """
+        cookie_file = os.path.join(self.output_dir, 'cookies.txt')
+        try:
+            # Collect the cookies from the browser session.
+            cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
+
+            # Write the temporary cookie file.
+            with open(cookie_file, 'w', encoding='utf-8') as f:
+                f.write('# Netscape HTTP Cookie File\n')
+                f.writelines(
+                    f'.coursera.org\tTRUE\t/\tTRUE\t0\t{name}\t{value}\n'
+                    for name, value in cookies.items()
+                )
+
+            ydl_opts = {
+                'outtmpl': output_path,
+                'format': 'best',
+                'cookiefile': cookie_file,
+                'quiet': False,
+                'no_warnings': False,
+            }
+
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                ydl.download([video_url])
+
+            return True
+        except Exception as e:
+            print(f"影片下載失敗: {e}")
+            return False
+        finally:
+            # The file holds live session credentials: never leave it behind.
+            try:
+                os.remove(cookie_file)
+            except OSError:
+                pass
+
+    def download_subtitles(self, video_element, output_dir):
+        """Download every ``<track>`` subtitle of ``video_element`` as ``.vtt``.
+
+        Files are named after the track label; tracks sharing a label get a
+        numeric suffix so they do not overwrite each other.
+
+        Returns:
+            True if the tracks were processed, False if looking them up
+            raised. Individual download failures are reported by
+            download_file() and do not change the return value.
+        """
+        try:
+            used_names = set()
+
+            for track in video_element.find_elements(By.TAG_NAME, "track"):
+                src = track.get_attribute('src')
+                if not src:
+                    continue
+
+                label = track.get_attribute('label') or 'subtitle'
+                stem = self.sanitize_filename(label) or 'subtitle'
+
+                # Pick a name not yet used in this call.
+                name, suffix = stem, 2
+                while name in used_names:
+                    name = f"{stem}_{suffix}"
+                    suffix += 1
+                used_names.add(name)
+
+                subtitle_path = os.path.join(output_dir, f"{name}.vtt")
+
+                print(f"下載字幕: {label}")
+                self.download_file(src, subtitle_path)
+
+            return True
+        except Exception as e:
+            print(f"字幕下載失敗: {e}")
+            return False
+
+    def download_course(self, course_url):
+        """Open a course page and prepare its backup directory.
+
+        Starts the browser and logs in when that has not happened yet,
+        writes ``course_info.json`` into the course directory, then keeps
+        the browser open for 60 seconds for manual inspection.
+
+        Returns:
+            True when the course directory was prepared, False when the
+            login failed.
+        """
+        print(f"\n開始備份課程: {course_url}")
+
+        # Start the WebDriver on first use.
+        if not self.driver:
+            self.setup_driver()
+
+        # Log in once per browser session. Repeating the flow for every
+        # course would fail, since the "Log In" link is gone after the
+        # first successful login.
+        if not self._logged_in and not self.login():
+            print("無法登入，停止下載")
+            return False
+
+        # Go to the course page.
+        self.driver.get(course_url)
+        self.safe_delay(3)
+
+        self.human_like_scroll()
+        self.safe_delay(2)
+
+        # Read the course title from the first <h1>, if there is one.
+        try:
+            heading = self.driver.find_element(By.CSS_SELECTOR, "h1").text.strip()
+        except Exception:
+            heading = ''
+
+        course_title = heading or 'Unknown'
+        # An empty directory name would make course_dir the output root.
+        dir_name = self.sanitize_filename(heading) or "unknown_course"
+        course_dir = os.path.join(self.output_dir, dir_name)
+        try:
+            Path(course_dir).mkdir(parents=True, exist_ok=True)
+        except OSError:
+            # The title still produced an unusable path; use the generic one.
+            course_dir = os.path.join(self.output_dir, "unknown_course")
+            Path(course_dir).mkdir(parents=True, exist_ok=True)
+
+        if heading:
+            print(f"課程名稱: {course_title}")
+
+        # Save the course metadata.
+        course_info = {
+            'title': course_title,
+            'url': course_url,
+            'download_date': time.strftime('%Y-%m-%d %H:%M:%S')
+        }
+
+        with open(os.path.join(course_dir, 'course_info.json'), 'w', encoding='utf-8') as f:
+            json.dump(course_info, f, ensure_ascii=False, indent=2)
+
+        print(f"\n課程內容將儲存至: {course_dir}")
+        print("\n注意：由於 Coursera 網站結構複雜，您可能需要：")
+        print("1. 手動導航到課程的「週」或「模組」頁面")
+        print("2. 使用瀏覽器開發者工具找到影片和資源的實際 URL")
+        print("3. 根據您的具體課程結構調整程式碼")
+        print("\n程式將保持瀏覽器開啟 60 秒，請手動檢查頁面結構...")
+
+        time.sleep(60)
+
+        return True
+
+    def close(self):
+        """Quit the browser and close the HTTP session; safe to call twice."""
+        if self.driver:
+            try:
+                self.driver.quit()
+            finally:
+                # Forget the driver even if quit() raised, so a later call
+                # does not try to quit a dead browser again.
+                self.driver = None
+                self._logged_in = False
+        self.session.close()
+
+
+def main():
+    """Command-line entry point: back up every course listed in config.json.
+
+    Reads ``config.json`` from the current working directory and calls
+    ``download_course`` for each entry of its ``course_urls`` list. The
+    downloader is always closed on exit, including on Ctrl+C or an
+    unexpected error.
+    """
+    print("=" * 60)
+    print("Coursera 課程備份工具")
+    print("=" * 60)
+
+    # Check that the configuration file exists.
+    if not os.path.exists('config.json'):
+        print("\n錯誤：找不到 config.json 檔案")
+        print("請參考 config.example.json 建立您的設定檔")
+        return
+
+    # Bound before the try block so the cleanup below can test it directly
+    # instead of introspecting locals().
+    downloader = None
+    try:
+        downloader = CourseraDownloader()
+
+        # Process the configured course URLs.
+        course_urls = downloader.config.get('course_urls')
+        if course_urls:
+            for course_url in course_urls:
+                downloader.download_course(course_url)
+        else:
+            print("\n請在 config.json 中設定要下載的課程 URL")
+
+    except KeyboardInterrupt:
+        print("\n\n使用者中斷下載")
+    except Exception as e:
+        print(f"\n發生錯誤: {e}")
+        import traceback
+        traceback.print_exc()
+    finally:
+        if downloader is not None:
+            downloader.close()
+        print("\n程式結束")
+
+
+# Run the backup only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()