This commit is contained in:
ChenKaiLiuG
2025-12-16 15:07:28 +08:00
parent 788f6db90a
commit e25629ed6d
5 changed files with 725 additions and 1 deletions

411
coursera_downloader.py Normal file
View File

@@ -0,0 +1,411 @@
"""
Coursera 課程備份工具
功能:下載課程影片、文字內容和字幕
"""
import os
import json
import time
import re
import random
from pathlib import Path
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm
import yt_dlp
class CourseraDownloader:
    """Back up a Coursera course (videos, text content and subtitles).

    The downloader drives a Selenium-controlled Chrome instance to log in and
    browse, and mirrors the browser cookies into a ``requests`` session that
    is used for plain file downloads.

    Configuration is read from a JSON file with these keys (all optional
    unless noted): ``email``, ``password``, ``output_dir``,
    ``download_videos``, ``download_subtitles``, ``download_resources``,
    ``delay_between_requests``, ``random_delay``, ``max_retries``,
    ``headless`` and ``course_urls``.
    """

    # Single source of truth for the browser identity presented by both the
    # requests session and the Selenium-driven browser.
    USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    )

    def __init__(self, config_path="config.json"):
        """Load the configuration, prepare the HTTP session and output dir.

        Args:
            config_path: Path of the JSON configuration file.

        Raises:
            OSError: If the configuration file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        with open(config_path, 'r', encoding='utf-8') as f:
            self.config = json.load(f)

        self.email = self.config.get('email')
        self.password = self.config.get('password')
        self.output_dir = self.config.get('output_dir', 'downloads')
        # NOTE: these three are boolean feature flags, not methods. The
        # subtitle *action* lives in ``download_subtitle_tracks`` so that the
        # ``download_subtitles`` flag below does not shadow it.
        self.download_videos = self.config.get('download_videos', True)
        self.download_subtitles = self.config.get('download_subtitles', True)
        self.download_resources = self.config.get('download_resources', True)

        # Pacing settings used by safe_delay() / download_file().
        self.delay_between_requests = self.config.get('delay_between_requests', 3)
        self.random_delay = self.config.get('random_delay', True)
        self.max_retries = self.config.get('max_retries', 3)

        self.session = requests.Session()
        # Accept-Encoding is intentionally NOT overridden: advertising "br"
        # by hand makes servers send Brotli even when the brotli package is
        # missing, in which case requests hands back undecoded bytes.
        self.session.headers.update({
            'User-Agent': self.USER_AGENT,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

        self.driver = None
        # Set once login() has succeeded for the current browser session.
        self.logged_in = False

        Path(self.output_dir).mkdir(parents=True, exist_ok=True)

    def setup_driver(self):
        """Create the Chrome WebDriver and store it in ``self.driver``."""
        options = webdriver.ChromeOptions()

        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
        options.add_experimental_option('useAutomationExtension', False)

        prefs = {
            'profile.default_content_setting_values': {
                'notifications': 2,  # block notification prompts
            },
            # Optional: skip image loading to speed pages up.
            'profile.managed_default_content_settings.images': 2,
        }
        options.add_experimental_option('prefs', prefs)

        options.add_argument(f'--user-agent={self.USER_AGENT}')

        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-software-rasterizer')
        options.add_argument('--disable-extensions')
        options.add_argument('--start-maximized')

        options.add_argument('--lang=zh-TW')

        headless = self.config.get('headless', False)
        if headless:
            options.add_argument('--headless=new')
            options.add_argument('--window-size=1920,1080')

        service = Service(ChromeDriverManager().install())
        self.driver = webdriver.Chrome(service=service, options=options)

        if not headless:
            self.driver.maximize_window()

        self.driver.execute_cdp_cmd('Network.setUserAgentOverride', {
            "userAgent": self.USER_AGENT
        })

        # NOTE(review): execute_script runs in the document that is loaded
        # right now, so these overrides presumably do not survive the next
        # navigation - confirm whether that is intended.
        self.driver.execute_script("""
            Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
            Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
            Object.defineProperty(navigator, 'languages', {get: () => ['zh-TW', 'zh', 'en-US', 'en']});
            window.chrome = {runtime: {}};
        """)

    def human_like_mouse_move(self, element):
        """Move the pointer near *element*, then onto it (best effort).

        Failures are ignored on purpose: the move is cosmetic and the caller
        clicks the element regardless.
        """
        try:
            actions = ActionChains(self.driver)
            actions.move_to_element_with_offset(element,
                                                random.randint(-5, 5),
                                                random.randint(-5, 5))
            actions.pause(random.uniform(0.1, 0.3))
            actions.move_to_element(element)
            actions.perform()
            time.sleep(random.uniform(0.2, 0.5))
        except Exception:
            # Catch Exception rather than everything so that Ctrl+C
            # (KeyboardInterrupt) still stops the program.
            pass

    def human_like_scroll(self):
        """Scroll the page down by a random 100-500 px, then pause briefly."""
        scroll_amount = random.randint(100, 500)
        self.driver.execute_script(f"window.scrollBy(0, {scroll_amount});")
        time.sleep(random.uniform(0.5, 1.5))

    def _type_like_human(self, element, text, long_pause_every):
        """Send *text* to *element* one character at a time.

        Every ``long_pause_every``-th character (starting with the first) is
        followed by a slightly longer pause than the others.
        """
        for i, char in enumerate(text):
            element.send_keys(char)
            if i % long_pause_every == 0:
                time.sleep(random.uniform(0.1, 0.25))
            else:
                time.sleep(random.uniform(0.05, 0.15))

    def login(self):
        """Log in through the browser and copy its cookies to the session.

        Returns:
            True when the login form was submitted without an error, False
            when credentials are missing or any step raised.
        """
        print("正在登入 Coursera...")

        # Without this check, iterating over a missing (None) credential
        # raises an opaque TypeError half-way through the form.
        if not self.email or not self.password:
            print("登入失敗: 設定檔中缺少 email 或 password")
            return False

        try:
            self.driver.get("https://www.coursera.org/")
            self.safe_delay(3)
            self.human_like_scroll()

            login_button = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.LINK_TEXT, "Log In"))
            )
            self.human_like_mouse_move(login_button)
            login_button.click()
            self.safe_delay(2)

            # Wait for the login form to appear.
            email_input = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "email"))
            )
            password_input = self.driver.find_element(By.ID, "password")

            self.human_like_mouse_move(email_input)
            email_input.click()
            self.safe_delay(0.3)
            self._type_like_human(email_input, self.email, 3)
            self.safe_delay(0.5)

            self.human_like_mouse_move(password_input)
            password_input.click()
            self.safe_delay(0.3)
            self._type_like_human(password_input, self.password, 4)
            self.safe_delay(1)

            submit_button = self.driver.find_element(By.CSS_SELECTOR, "button[type='submit']")
            self.human_like_mouse_move(submit_button)
            submit_button.click()

            # NOTE(review): success is assumed after a fixed wait; nothing
            # here verifies that the site actually accepted the credentials.
            self.safe_delay(5)
            print("登入成功!")

            # Mirror the browser cookies into the requests session. Each
            # cookie keeps its own domain/path so that authentication cookies
            # are not sent to unrelated hosts (e.g. CDN download URLs).
            for cookie in self.driver.get_cookies():
                self.session.cookies.set(
                    cookie['name'],
                    cookie['value'],
                    domain=cookie.get('domain') or '',
                    path=cookie.get('path') or '/',
                )
            return True
        except Exception as e:
            print(f"登入失敗: {e}")
            return False

    def safe_delay(self, base_delay=None):
        """Sleep for roughly *base_delay* seconds.

        Args:
            base_delay: Seconds to wait; defaults to the configured
                ``delay_between_requests``. When ``random_delay`` is enabled
                the wait is scaled by a random factor in [0.5, 1.5).
        """
        if base_delay is None:
            base_delay = self.delay_between_requests
        if self.random_delay:
            delay = base_delay * (0.5 + random.random())
        else:
            delay = base_delay
        # time.sleep() raises ValueError on negative values.
        time.sleep(max(0.0, delay))

    def sanitize_filename(self, filename):
        """Return *filename* made safe for use as a single path component.

        Characters that are illegal on common filesystems (and control
        characters) become ``_``, the result is limited to 200 characters,
        trailing dots/spaces are dropped (Windows rejects them) and an empty
        result falls back to ``'untitled'``.
        """
        cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', filename.strip())
        cleaned = cleaned[:200].rstrip(' .')
        return cleaned or 'untitled'

    def download_file(self, url, filepath):
        """Download *url* to *filepath* using the authenticated session.

        The body is streamed into ``<filepath>.part`` and renamed only after
        it is complete, so a failed transfer never leaves a truncated file
        under the final name. A failed attempt is retried up to the
        configured ``max_retries`` times.

        Returns:
            True on success, False when every attempt failed.
        """
        retries = getattr(self, 'max_retries', 0) or 0
        attempts = 1 + max(0, int(retries))
        partial_path = f"{filepath}.part"
        # Per-request headers are merged over the session headers.
        headers = {'Referer': 'https://www.coursera.org/'}

        for _ in range(attempts):
            # Pause before every request, including retries.
            self.safe_delay()
            try:
                # The context manager releases the connection even on error.
                with self.session.get(url, stream=True, timeout=30, headers=headers) as response:
                    response.raise_for_status()
                    total_size = int(response.headers.get('content-length', 0))
                    with open(partial_path, 'wb') as f, tqdm(
                        desc=os.path.basename(filepath),
                        total=total_size or None,  # unknown length -> no total
                        unit='iB',
                        unit_scale=True,
                        unit_divisor=1024,
                    ) as pbar:
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:
                                pbar.update(f.write(chunk))
                os.replace(partial_path, filepath)
                return True
            except Exception as e:
                print(f"下載失敗 {url}: {e}")
                try:
                    if os.path.exists(partial_path):
                        os.remove(partial_path)
                except OSError:
                    pass
        return False

    @staticmethod
    def _netscape_cookie_lines(cookies):
        """Format Selenium cookie dicts as Netscape ``cookies.txt`` rows.

        Each row is ``domain, include-subdomains, path, secure, expiry, name,
        value`` separated by tabs. The include-subdomains flag must be TRUE
        exactly when the domain starts with a dot, otherwise the standard
        Mozilla cookie-file loader rejects the file. Missing fields fall back
        to ``.coursera.org`` / ``/`` / secure / session cookie.
        """
        lines = []
        for cookie in cookies:
            domain = cookie.get('domain') or '.coursera.org'
            include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
            path = cookie.get('path') or '/'
            secure = 'TRUE' if cookie.get('secure', True) else 'FALSE'
            expiry = str(int(cookie.get('expiry') or 0))
            lines.append('\t'.join((
                domain, include_subdomains, path, secure, expiry,
                cookie['name'], cookie['value'],
            )))
        return lines

    def download_video_with_ytdlp(self, video_url, output_path):
        """Download *video_url* to *output_path* with yt-dlp.

        The browser cookies are exported to a temporary ``cookies.txt`` in
        the output directory; the file holds session credentials and is
        removed again whether or not the download succeeds.

        Returns:
            True on success, False if anything raised.
        """
        cookie_file = os.path.join(self.output_dir, 'cookies.txt')
        try:
            cookie_lines = self._netscape_cookie_lines(self.driver.get_cookies())
            with open(cookie_file, 'w', encoding='utf-8') as f:
                f.write('# Netscape HTTP Cookie File\n')
                f.writelines(f'{line}\n' for line in cookie_lines)

            ydl_opts = {
                'outtmpl': output_path,
                'format': 'best',
                'cookiefile': cookie_file,
                'quiet': False,
                'no_warnings': False,
            }
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([video_url])
            return True
        except Exception as e:
            print(f"影片下載失敗: {e}")
            return False
        finally:
            # Never leave credentials behind, even after a failed download.
            if os.path.exists(cookie_file):
                os.remove(cookie_file)

    def download_subtitle_tracks(self, video_element, output_dir):
        """Download every ``<track>`` of *video_element* as a ``.vtt`` file.

        Args:
            video_element: Selenium element containing ``<track>`` children.
            output_dir: Directory the subtitle files are written to.

        Returns:
            True when the tracks were enumerated (individual downloads report
            their own failures), False if enumeration itself raised.
        """
        try:
            for track in video_element.find_elements(By.TAG_NAME, "track"):
                src = track.get_attribute('src')
                label = track.get_attribute('label') or 'subtitle'
                if not src:
                    continue
                subtitle_filename = f"{self.sanitize_filename(label)}.vtt"
                subtitle_path = os.path.join(output_dir, subtitle_filename)
                print(f"下載字幕: {label}")
                self.download_file(src, subtitle_path)
            return True
        except Exception as e:
            print(f"字幕下載失敗: {e}")
            return False

    # Backward-compatible class-level name. On instances the boolean
    # ``download_subtitles`` config flag set in __init__ takes precedence,
    # which is why the method itself is called download_subtitle_tracks.
    download_subtitles = download_subtitle_tracks

    def download_course(self, course_url):
        """Open *course_url*, create its backup folder and write its metadata.

        The browser is started and logged in on first use only, so several
        courses can be processed with the same downloader.

        Returns:
            True when the course folder was prepared, False if login failed.
        """
        print(f"\n開始備份課程: {course_url}")

        if not self.driver:
            self.setup_driver()

        # Log in once per browser session: after a successful login the
        # "Log In" link no longer exists, so a second attempt would fail.
        if not getattr(self, 'logged_in', False):
            if not self.login():
                print("無法登入,停止下載")
                return False
            self.logged_in = True

        self.driver.get(course_url)
        self.safe_delay(3)
        self.human_like_scroll()
        self.safe_delay(2)

        # The title is best effort: fall back to a fixed folder name when the
        # heading is missing or empty.
        try:
            course_title = self.driver.find_element(By.CSS_SELECTOR, "h1").text
        except Exception:
            course_title = None
        if course_title and course_title.strip():
            folder_name = self.sanitize_filename(course_title)
        else:
            course_title = None
            folder_name = "unknown_course"

        course_dir = os.path.join(self.output_dir, folder_name)
        Path(course_dir).mkdir(parents=True, exist_ok=True)
        if course_title is not None:
            print(f"課程名稱: {course_title}")

        course_info = {
            'title': course_title if course_title is not None else 'Unknown',
            'url': course_url,
            'download_date': time.strftime('%Y-%m-%d %H:%M:%S')
        }
        with open(os.path.join(course_dir, 'course_info.json'), 'w', encoding='utf-8') as f:
            json.dump(course_info, f, ensure_ascii=False, indent=2)

        print(f"\n課程內容將儲存至: {course_dir}")
        print("\n注意:由於 Coursera 網站結構複雜,您可能需要:")
        print("1. 手動導航到課程的「週」或「模組」頁面")
        print("2. 使用瀏覽器開發者工具找到影片和資源的實際 URL")
        print("3. 根據您的具體課程結構調整程式碼")
        print("\n程式將保持瀏覽器開啟 60 秒,請手動檢查頁面結構...")
        # Keep the browser open so the page structure can be inspected.
        time.sleep(60)
        return True

    def close(self):
        """Quit the browser and close the HTTP session; safe to call twice."""
        if self.driver:
            try:
                self.driver.quit()
            finally:
                self.driver = None
                self.logged_in = False
        session = getattr(self, 'session', None)
        if session is not None:
            session.close()
def main(config_path="config.json"):
    """Run the backup tool for every course URL listed in the config file.

    Args:
        config_path: Path of the JSON configuration file. Defaults to
            ``config.json`` in the current working directory.

    Returns:
        None. Progress and errors are reported on standard output.
    """
    print("=" * 60)
    print("Coursera 課程備份工具")
    print("=" * 60)

    # Bail out early with a hint when the configuration file is missing.
    if not os.path.exists(config_path):
        print(f"\n錯誤:找不到 {config_path} 檔案")
        print("請參考 config.example.json 建立您的設定檔")
        return

    # Explicit sentinel so the cleanup below knows whether construction
    # succeeded, without inspecting locals().
    downloader = None
    try:
        downloader = CourseraDownloader(config_path)

        course_urls = downloader.config.get('course_urls')
        if course_urls:
            for course_url in course_urls:
                downloader.download_course(course_url)
        else:
            print("\n請在 config.json 中設定要下載的課程 URL")
    except KeyboardInterrupt:
        print("\n\n使用者中斷下載")
    except Exception as e:
        # Top-level boundary: report the error with a traceback rather than
        # letting the script die with a bare stack dump.
        print(f"\n發生錯誤: {e}")
        import traceback
        traceback.print_exc()
    finally:
        if downloader is not None:
            downloader.close()
        print("\n程式結束")
# Script entry point: run the tool only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()