# Commit message: "Update"
# New file in this commit: coursera_downloader.py (normal file, 411 lines added)
# Diff hunk: @@ -0,0 +1,411 @@
"""
Coursera course backup tool.

Features: download course videos, text content and subtitles.
"""

import json
import os
import random
import re
import tempfile
import time
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
import yt_dlp
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager


class CourseraDownloader:
    """Back up a Coursera course: videos, text content and subtitles.

    Settings come from a JSON config file. A Selenium-driven Chrome instance
    handles login and navigation; a ``requests`` session seeded with the
    browser's cookies and ``yt-dlp`` perform the actual file downloads.
    """

    # Single definition of the browser identity used by both the requests
    # session and the Selenium-driven Chrome instance.
    USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    )

    def __init__(self, config_path="config.json"):
        """Load the configuration and prepare the HTTP session.

        Args:
            config_path: Path to the JSON configuration file.

        Raises:
            OSError: If the configuration file cannot be opened.
            json.JSONDecodeError: If the file does not contain valid JSON.
        """
        with open(config_path, 'r', encoding='utf-8') as f:
            self.config = json.load(f)

        self.email = self.config.get('email')
        self.password = self.config.get('password')
        self.output_dir = self.config.get('output_dir', 'downloads')
        self.download_videos = self.config.get('download_videos', True)
        # Kept under a distinct name: an instance attribute called
        # ``download_subtitles`` would shadow the method of the same name and
        # make ``self.download_subtitles(...)`` fail with "'bool' is not callable".
        self.download_subtitles_enabled = self.config.get('download_subtitles', True)
        self.download_resources = self.config.get('download_resources', True)

        # Pacing settings that keep the request rate low.
        self.delay_between_requests = self.config.get('delay_between_requests', 3)  # seconds
        self.random_delay = self.config.get('random_delay', True)  # jitter each delay
        self.max_retries = self.config.get('max_retries', 3)  # maximum retry count

        self.session = requests.Session()
        # Browser-like default headers. Accept-Encoding is deliberately left to
        # requests: it only advertises encodings it can actually decode, while a
        # hard-coded "br" yields undecoded bodies when no brotli package is
        # installed.
        self.session.headers.update({
            'User-Agent': self.USER_AGENT,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })
        self.driver = None
        # Set once login() succeeds so that later courses reuse the session.
        self._logged_in = False

        # Make sure the output directory exists.
        Path(self.output_dir).mkdir(parents=True, exist_ok=True)

    def setup_driver(self):
        """Create the Chrome WebDriver and store it in ``self.driver``.

        Honours the ``headless`` config key (default ``False``).
        """
        headless = self.config.get('headless', False)
        options = webdriver.ChromeOptions()

        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
        options.add_experimental_option('useAutomationExtension', False)

        prefs = {
            'profile.default_content_setting_values': {
                'notifications': 2,  # block notification prompts
            },
            'profile.managed_default_content_settings.images': 2,  # optional: skip images for speed
        }
        options.add_experimental_option('prefs', prefs)

        options.add_argument(f'--user-agent={self.USER_AGENT}')

        for flag in (
            '--disable-dev-shm-usage',
            '--no-sandbox',
            '--disable-gpu',
            '--disable-software-rasterizer',
            '--disable-extensions',
            '--start-maximized',
            '--lang=zh-TW',
        ):
            options.add_argument(flag)

        if headless:
            options.add_argument('--headless=new')  # new-style headless mode
            options.add_argument('--window-size=1920,1080')

        service = Service(ChromeDriverManager().install())
        self.driver = webdriver.Chrome(service=service, options=options)

        if not headless:
            self.driver.maximize_window()

        self.driver.execute_cdp_cmd('Network.setUserAgentOverride', {
            "userAgent": self.USER_AGENT
        })

        # Override a few navigator properties in the current document.
        self.driver.execute_script("""
            Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
            Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
            Object.defineProperty(navigator, 'languages', {get: () => ['zh-TW', 'zh', 'en-US', 'en']});
            window.chrome = {runtime: {}};
        """)

    def human_like_mouse_move(self, element):
        """Move the pointer near ``element`` and then onto it, with short pauses.

        Purely cosmetic and best-effort: any WebDriver error is ignored so the
        caller's flow continues.
        """
        try:
            actions = ActionChains(self.driver)
            # Land slightly off-target first, then settle on the element.
            actions.move_to_element_with_offset(
                element,
                random.randint(-5, 5),
                random.randint(-5, 5),
            )
            actions.pause(random.uniform(0.1, 0.3))
            actions.move_to_element(element)
            actions.perform()
            time.sleep(random.uniform(0.2, 0.5))
        except Exception:
            # Deliberately swallowed (but no longer a bare ``except`` that would
            # also eat KeyboardInterrupt/SystemExit).
            pass

    def human_like_scroll(self):
        """Scroll the page down by a random 100-500 px and pause briefly."""
        scroll_amount = random.randint(100, 500)
        self.driver.execute_script(f"window.scrollBy(0, {scroll_amount});")
        time.sleep(random.uniform(0.5, 1.5))

    def _type_like_human(self, field, text, pause_every):
        """Send ``text`` to ``field`` one character at a time with varied pauses.

        Every ``pause_every``-th character gets a slightly longer pause.
        """
        for index, char in enumerate(text):
            field.send_keys(char)
            if index % pause_every == 0:
                time.sleep(random.uniform(0.1, 0.25))
            else:
                time.sleep(random.uniform(0.05, 0.15))

    def _sync_cookies_to_session(self):
        """Copy the browser's cookies into the requests session.

        Domain and path are preserved so the cookies are only sent back to the
        hosts that issued them instead of to every host the session contacts.
        """
        for cookie in self.driver.get_cookies():
            self.session.cookies.set(
                cookie['name'],
                cookie['value'],
                domain=cookie.get('domain', ''),
                path=cookie.get('path', '/'),
            )

    def login(self):
        """Log in to Coursera through the browser.

        Returns:
            bool: ``True`` when the login flow completed (or a previous login is
            still active), ``False`` when credentials are missing or any step
            of the flow raised.
        """
        if self._logged_in:
            # The "Log In" link is gone once authenticated; running the flow
            # again would time out and wrongly report a failure.
            return True

        if not self.email or not self.password:
            print("登入失敗: 設定檔中缺少 email 或 password")
            return False

        print("正在登入 Coursera...")
        self.driver.get("https://www.coursera.org/")
        self.safe_delay(3)

        self.human_like_scroll()

        try:
            # Open the login form.
            login_button = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.LINK_TEXT, "Log In"))
            )
            self.human_like_mouse_move(login_button)
            login_button.click()
            self.safe_delay(2)

            # Wait for the form fields.
            email_input = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "email"))
            )
            password_input = self.driver.find_element(By.ID, "password")

            self.human_like_mouse_move(email_input)
            email_input.click()
            self.safe_delay(0.3)
            self._type_like_human(email_input, self.email, 3)

            self.safe_delay(0.5)

            self.human_like_mouse_move(password_input)
            password_input.click()
            self.safe_delay(0.3)
            self._type_like_human(password_input, self.password, 4)

            self.safe_delay(1)

            # Submit the form.
            submit_button = self.driver.find_element(By.CSS_SELECTOR, "button[type='submit']")
            self.human_like_mouse_move(submit_button)
            submit_button.click()

            # NOTE(review): success is assumed after a fixed wait; nothing here
            # checks that the site actually accepted the credentials.
            self.safe_delay(5)
            print("登入成功!")

            self._sync_cookies_to_session()
            self._logged_in = True
            return True
        except Exception as e:
            print(f"登入失敗: {e}")
            return False

    def safe_delay(self, base_delay=None):
        """Sleep for roughly ``base_delay`` seconds.

        Args:
            base_delay: Delay in seconds; defaults to ``delay_between_requests``.
                With ``random_delay`` enabled the actual sleep is drawn from
                ``[0.5, 1.5) * base_delay``.
        """
        if base_delay is None:
            base_delay = self.delay_between_requests

        if self.random_delay:
            delay = base_delay * (0.5 + random.random())
        else:
            delay = base_delay

        # A negative configured delay would make time.sleep() raise ValueError.
        time.sleep(max(0, delay))

    def sanitize_filename(self, filename):
        """Return ``filename`` made safe for use as a single path component.

        Characters that are illegal in file names (``<>:"/\\|?*`` and control
        characters) become ``_``; surrounding whitespace is stripped; the
        result is capped at 200 characters and trailing dots/spaces are
        removed. Names that end up empty (including ``.`` and ``..``) become
        ``'untitled'`` so the result can never resolve to the parent directory.
        """
        cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', filename)
        cleaned = cleaned.strip()[:200].rstrip().rstrip('. ')
        return cleaned or 'untitled'

    def download_file(self, url, filepath):
        """Download ``url`` to ``filepath`` with a progress bar.

        The data is written to ``<filepath>.part`` and renamed on success, so a
        failed transfer never leaves a truncated file under the final name nor
        clobbers a previously completed download.

        Returns:
            bool: ``True`` on success, ``False`` on any error (which is printed).
        """
        # Pace every download.
        self.safe_delay()

        part_path = f"{filepath}.part"
        try:
            # Session headers are merged automatically; only the Referer is
            # request-specific. The context manager releases the connection.
            with self.session.get(
                url,
                stream=True,
                timeout=30,
                headers={'Referer': 'https://www.coursera.org/'},
            ) as response:
                response.raise_for_status()
                total_size = int(response.headers.get('content-length', 0))

                with open(part_path, 'wb') as f, tqdm(
                    desc=os.path.basename(filepath),
                    total=total_size or None,  # unknown size -> open-ended bar
                    unit='iB',
                    unit_scale=True,
                    unit_divisor=1024,
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        pbar.update(f.write(chunk))

            os.replace(part_path, filepath)
            return True
        except Exception as e:
            print(f"下載失敗 {url}: {e}")
            try:
                os.remove(part_path)
            except OSError:
                pass  # nothing was written, or it is already gone
            return False

    def download_video_with_ytdlp(self, video_url, output_path):
        """Download a video with yt-dlp, authenticated via the browser's cookies.

        The cookies are exported to a private temporary Netscape-format file
        that is always deleted afterwards, even when the download fails.

        Returns:
            bool: ``True`` on success, ``False`` on any error (which is printed).
        """
        cookie_file = None
        try:
            # name -> value; a later cookie with the same name wins.
            cookies = {c['name']: c['value'] for c in self.driver.get_cookies()}

            # mkstemp gives an unpredictable name readable only by the owner.
            fd, cookie_file = tempfile.mkstemp(prefix='coursera_cookies_', suffix='.txt')
            with os.fdopen(fd, 'w', encoding='utf-8') as f:
                f.write('# Netscape HTTP Cookie File\n')
                for name, value in cookies.items():
                    f.write(f'.coursera.org\tTRUE\t/\tTRUE\t0\t{name}\t{value}\n')

            ydl_opts = {
                'outtmpl': output_path,
                'format': 'best',
                'cookiefile': cookie_file,
                'quiet': False,
                'no_warnings': False,
            }

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([video_url])

            return True
        except Exception as e:
            print(f"影片下載失敗: {e}")
            return False
        finally:
            # Session cookies must not be left on disk.
            if cookie_file and os.path.exists(cookie_file):
                try:
                    os.remove(cookie_file)
                except OSError:
                    pass

    def download_subtitles(self, video_element, output_dir):
        """Download every ``<track>`` of ``video_element`` as a ``.vtt`` file.

        Files are named after the track label (``subtitle`` when absent); a
        numeric suffix is added when two tracks share a label so they do not
        overwrite each other.

        Returns:
            bool: ``True`` when every track with a ``src`` was downloaded (or
            there were none), ``False`` when a download failed or an error
            occurred.
        """
        try:
            all_ok = True
            used_names = set()

            for track in video_element.find_elements(By.TAG_NAME, "track"):
                src = track.get_attribute('src')
                if not src:
                    continue
                label = track.get_attribute('label') or 'subtitle'

                base_name = self.sanitize_filename(label)
                name, counter = base_name, 2
                while name in used_names:
                    name = f"{base_name}_{counter}"
                    counter += 1
                used_names.add(name)

                subtitle_path = os.path.join(output_dir, f"{name}.vtt")

                print(f"下載字幕: {label}")
                if not self.download_file(src, subtitle_path):
                    all_ok = False

            return all_ok
        except Exception as e:
            print(f"字幕下載失敗: {e}")
            return False

    def download_course(self, course_url):
        """Open a course page, create its output folder and record its metadata.

        The browser is started and logged in on first use and reused for later
        courses. After writing ``course_info.json`` the browser is kept open
        for 60 seconds so the page structure can be inspected manually.

        Returns:
            bool: ``False`` when login failed, ``True`` otherwise.
        """
        print(f"\n開始備份課程: {course_url}")

        if not self.driver:
            self.setup_driver()

        if not self.login():
            print("無法登入,停止下載")
            return False

        self.driver.get(course_url)
        self.safe_delay(3)

        self.human_like_scroll()
        self.safe_delay(2)

        # Use the page's first <h1> as the course title when available.
        try:
            course_title = self.driver.find_element(By.CSS_SELECTOR, "h1").text
        except Exception:
            course_title = None

        if course_title:
            course_dir = os.path.join(self.output_dir, self.sanitize_filename(course_title))
            print(f"課程名稱: {course_title}")
        else:
            course_dir = os.path.join(self.output_dir, "unknown_course")
        Path(course_dir).mkdir(parents=True, exist_ok=True)

        course_info = {
            'title': course_title or 'Unknown',
            'url': course_url,
            'download_date': time.strftime('%Y-%m-%d %H:%M:%S'),
        }

        with open(os.path.join(course_dir, 'course_info.json'), 'w', encoding='utf-8') as f:
            json.dump(course_info, f, ensure_ascii=False, indent=2)

        print(f"\n課程內容將儲存至: {course_dir}")
        print("\n注意:由於 Coursera 網站結構複雜,您可能需要:")
        print("1. 手動導航到課程的「週」或「模組」頁面")
        print("2. 使用瀏覽器開發者工具找到影片和資源的實際 URL")
        print("3. 根據您的具體課程結構調整程式碼")
        print("\n程式將保持瀏覽器開啟 60 秒,請手動檢查頁面結構...")

        time.sleep(60)

        return True

    def close(self):
        """Quit the browser, if one was started, and forget the login state."""
        if self.driver:
            self.driver.quit()
            # Allow a later download_course() to start a fresh browser.
            self.driver = None
        self._logged_in = False


def main():
    """Command-line entry point: back up every course listed in ``config.json``.

    Reads ``course_urls`` from the configuration and calls
    ``CourseraDownloader.download_course`` for each entry. The browser is
    always closed on the way out, including after Ctrl-C or an unexpected
    error (whose traceback is printed).
    """
    print("=" * 60)
    print("Coursera 課程備份工具")
    print("=" * 60)

    # Bail out early with a helpful hint when the config file is missing.
    if not os.path.exists('config.json'):
        print("\n錯誤:找不到 config.json 檔案")
        print("請參考 config.example.json 建立您的設定檔")
        return

    # Explicit sentinel instead of probing locals() in the finally clause.
    downloader = None
    try:
        downloader = CourseraDownloader()

        course_urls = downloader.config.get('course_urls')
        if course_urls:
            for course_url in course_urls:
                downloader.download_course(course_url)
        else:
            print("\n請在 config.json 中設定要下載的課程 URL")

    except KeyboardInterrupt:
        print("\n\n使用者中斷下載")
    except Exception as e:
        print(f"\n發生錯誤: {e}")
        import traceback
        traceback.print_exc()
    finally:
        if downloader is not None:
            downloader.close()
        print("\n程式結束")


# Run the backup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
# Reference in New Issue
# Block a user