mirror of
https://github.com/Death916/c2cscrape.git
synced 2026-04-10 03:04:40 -07:00
redo parsing due to zfire being down. added initial knaben search parse
This commit is contained in:
parent
b61b888b22
commit
d73b179181
6 changed files with 79 additions and 180 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -89,3 +89,4 @@ uv.lock
|
||||||
|
|
||||||
old.pyproject.toml
|
old.pyproject.toml
|
||||||
.github
|
.github
|
||||||
|
.flox
|
||||||
|
|
|
||||||
1
.python-version
Normal file
1
.python-version
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
3.13
|
||||||
0
README.md
Normal file
0
README.md
Normal file
|
|
@ -3,7 +3,7 @@ name = "c2cscrape"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
description = "Add your description here"
|
description = "Add your description here"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.11"
|
requires-python = ">=3.13"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bs4>=0.0.2",
|
"bs4>=0.0.2",
|
||||||
"requests>=2.32.5",
|
"requests>=2.32.5",
|
||||||
|
|
|
||||||
21
shell.nix
Normal file
21
shell.nix
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
let
|
||||||
|
nixconfig = builtins.getFlake "github:death916/nixconfig";
|
||||||
|
unstable = nixconfig.inputs.nixpkgs-unstable.legacyPackages.x86_64-linux;
|
||||||
|
pkgs = nixconfig.inputs.nixpkgs.legacyPackages.x86_64-linux;
|
||||||
|
in
|
||||||
|
pkgs.mkShell {
|
||||||
|
packages = with pkgs; [
|
||||||
|
python313Packages.uv
|
||||||
|
python313Packages.ninja
|
||||||
|
python313Packages.numpy
|
||||||
|
bun
|
||||||
|
|
||||||
|
];
|
||||||
|
shellHook = ''
|
||||||
|
source .venv/bin/activate
|
||||||
|
# export PATH="${pkgs.bun}/bin:$PATH"
|
||||||
|
# export BUN_INSTALL="${pkgs.bun}/bin/bun"
|
||||||
|
export REFLEX_USE_SYSTEM_BUN=True
|
||||||
|
echo venv activated and bun version set
|
||||||
|
'';
|
||||||
|
}
|
||||||
210
src/c2cscrape.py
210
src/c2cscrape.py
|
|
@ -5,16 +5,16 @@ import os
|
||||||
import random
|
import random
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
from calendar import c
|
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
class C2CScrape:
|
class TorrentScrape:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.url = "https://zfirelight.blogspot.com/"
|
self.url = "https://knaben.org/search/coast%20to%20coast%20am/0/1/date"
|
||||||
self.episodes = []
|
self.episodes = []
|
||||||
|
self.download_amount = 5
|
||||||
self.last_download = None
|
self.last_download = None
|
||||||
self.last_download_link = None
|
self.last_download_link = None
|
||||||
self.headers = {
|
self.headers = {
|
||||||
|
|
@ -27,188 +27,64 @@ class C2CScrape:
|
||||||
# Remove or replace invalid filename characters
|
# Remove or replace invalid filename characters
|
||||||
return re.sub(r'[<>:"/\\|?*]', "-", filename)
|
return re.sub(r'[<>:"/\\|?*]', "-", filename)
|
||||||
|
|
||||||
def get_drive_link(self, url):
|
def get_torrent_page(self):
|
||||||
try:
|
try:
|
||||||
response = requests.get(url, headers=self.headers)
|
response = requests.get(self.url, headers=self.headers)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
self.soup = BeautifulSoup(response.text, "html.parser")
|
print(response)
|
||||||
|
return response.text
|
||||||
iframes = self.soup.find_all("iframe")
|
|
||||||
for iframe in iframes:
|
|
||||||
src = iframe.get("src")
|
|
||||||
if src and "drive.google.com" in src:
|
|
||||||
print("Found drive link:", src)
|
|
||||||
return src
|
|
||||||
except requests.RequestException as e:
|
except requests.RequestException as e:
|
||||||
print(f"Error fetching page: {e}")
|
print(f"Error fetching page: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_episode_info(self, soup):
|
def get_episode_info(self):
|
||||||
title_element = soup.find("h3", class_="post-title entry-title")
|
page = self.get_torrent_page()
|
||||||
if not title_element:
|
if not page:
|
||||||
|
print("No page found")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
title_link = title_element.find("a")
|
soup = BeautifulSoup(page, "html.parser")
|
||||||
if not title_link:
|
main_class = soup.find("div", class_="p-3")
|
||||||
|
|
||||||
|
if not main_class:
|
||||||
|
print("No main class found")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
full_title = title_link.text
|
episode_elems = main_class.select(".text-wrap.w-100")
|
||||||
date_str = full_title.split(" ")[0]
|
if episode_elems:
|
||||||
|
print(
|
||||||
return {"title": full_title, "date": date_str, "url": title_link["href"]}
|
f"Found {len(episode_elems)} episode elements using selector .text-wrap.w-100"
|
||||||
|
|
||||||
def create_download_link(self):
|
|
||||||
url = self.get_drive_link(self.url)
|
|
||||||
if not url:
|
|
||||||
return None
|
|
||||||
|
|
||||||
print("Creating download link for:", url)
|
|
||||||
try:
|
|
||||||
cleaned_url = url.split("/file/d/")[1].split("/")[0]
|
|
||||||
download_link = (
|
|
||||||
f"https://drive.google.com/uc?export=download&id={cleaned_url}"
|
|
||||||
)
|
)
|
||||||
print("Download link:", download_link)
|
if len(episode_elems) > self.download_amount:
|
||||||
return download_link
|
print(
|
||||||
except IndexError:
|
f"Too many episodes found, only downloading last {self.download_amount}"
|
||||||
print("Error: Invalid URL format")
|
)
|
||||||
return None
|
episode_elems = episode_elems[: self.download_amount]
|
||||||
|
for ep in episode_elems:
|
||||||
def download_episode(self, url):
|
a = ep.find("a")
|
||||||
try:
|
if not a:
|
||||||
episode_data = self.get_episode_info(self.soup)
|
continue
|
||||||
if not episode_data:
|
title = a.get("title") or a.get_text(strip=True)
|
||||||
print("Error: Could not get episode info")
|
# skip episodes that don't start with "Coast" as sometimes they show up in search
|
||||||
return
|
if not title.startswith("Coast"):
|
||||||
|
print(f"Skipping episode {title}")
|
||||||
# Create downloads directory if it doesn't exist
|
continue
|
||||||
download_dir = "./downloads"
|
link = a.get("href")
|
||||||
os.makedirs(download_dir, exist_ok=True)
|
|
||||||
|
|
||||||
# Get current date
|
|
||||||
date = datetime.datetime.now().strftime("%Y-%m-%d")
|
|
||||||
|
|
||||||
# Create sanitized filename
|
|
||||||
filename = f"{episode_data['title']}.mp4"
|
|
||||||
safe_filename = self.sanitize_filename(filename)
|
|
||||||
filepath = os.path.join(download_dir, safe_filename)
|
|
||||||
|
|
||||||
# Check if file already exists
|
|
||||||
if os.path.exists(filepath):
|
|
||||||
print(f"File already exists: {safe_filename}")
|
|
||||||
return
|
|
||||||
|
|
||||||
# Download the file
|
|
||||||
response = requests.get(url, headers=self.headers)
|
|
||||||
response.raise_for_status()
|
|
||||||
|
|
||||||
with open(filepath, "wb") as f:
|
|
||||||
f.write(response.content)
|
|
||||||
print(f"Downloaded: {safe_filename}")
|
|
||||||
print("sleeping for 3-7 seconds")
|
|
||||||
time.sleep(random.randint(3, 7))
|
|
||||||
|
|
||||||
# Update last download info
|
|
||||||
self.episodes_downloaded += 1
|
self.episodes_downloaded += 1
|
||||||
self.last_download = date
|
print(f"found episode {title}")
|
||||||
self.last_download_link = url
|
# print(f"link: {link}")
|
||||||
|
|
||||||
except requests.RequestException as e:
|
print("done")
|
||||||
print(f"Error downloading episode: {e}")
|
return # need to return link later to qbit but need to decide logic
|
||||||
except Exception as e:
|
|
||||||
print(f"Error: {e}")
|
|
||||||
|
|
||||||
def is_duplicate_file(self, soup):
|
|
||||||
try:
|
|
||||||
episode_data = self.get_episode_info(soup)
|
|
||||||
if not episode_data:
|
|
||||||
return False
|
|
||||||
|
|
||||||
date = datetime.datetime.now().strftime("%Y-%m-%d")
|
class Qbittorrent:
|
||||||
filename = f"{episode_data['title']} {date}.mp4"
|
pass
|
||||||
safe_filename = self.sanitize_filename(filename)
|
|
||||||
filepath = os.path.join("downloads", safe_filename)
|
|
||||||
|
|
||||||
return os.path.exists(filepath)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error checking duplicate: {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
def process_episode(self):
|
|
||||||
try:
|
|
||||||
drive_url = self.get_drive_link(self.url) # This sets self.soup
|
|
||||||
if not drive_url:
|
|
||||||
return
|
|
||||||
|
|
||||||
if self.is_duplicate_file(self.soup):
|
|
||||||
print("Episode already exists, skipping download")
|
|
||||||
return
|
|
||||||
|
|
||||||
download_url = self.create_download_link()
|
|
||||||
if download_url:
|
|
||||||
self.download_episode(download_url)
|
|
||||||
|
|
||||||
except requests.RequestException as e:
|
|
||||||
print(f"Error processing episode: {e}")
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error: {e}")
|
|
||||||
|
|
||||||
# timer to check for new episodes every 12 hours
|
|
||||||
def start(self):
|
|
||||||
# reset base url
|
|
||||||
self.url = "https://zfirelight.blogspot.com/"
|
|
||||||
original_url = self.url
|
|
||||||
try:
|
|
||||||
# Run our core operations
|
|
||||||
self.process_episode()
|
|
||||||
self.get_older_posts()
|
|
||||||
print(f"Episodes downloaded: {self.episodes_downloaded}")
|
|
||||||
finally:
|
|
||||||
self.url = original_url
|
|
||||||
self.last_download = None
|
|
||||||
self.last_download_link = None
|
|
||||||
self.episodes_downloaded = 0
|
|
||||||
|
|
||||||
# navigate to older posts button 5 times and get last 5 episodes with no repeats/ span id is blog-pager-older-link
|
|
||||||
def get_older_posts(self, limit=5):
|
|
||||||
try:
|
|
||||||
response = requests.get(self.url, headers=self.headers)
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
|
||||||
older_posts = soup.find("span", id="blog-pager-older-link")
|
|
||||||
processed_urls = set()
|
|
||||||
posts_processed = 0
|
|
||||||
|
|
||||||
while older_posts and posts_processed < limit:
|
|
||||||
older_link = older_posts.find("a")["href"]
|
|
||||||
|
|
||||||
if older_link in processed_urls:
|
|
||||||
break
|
|
||||||
processed_urls.add(older_link)
|
|
||||||
|
|
||||||
# Get the older posts page
|
|
||||||
self.url = older_link # Update URL to use existing functions
|
|
||||||
print(f"Processing page: {older_link}")
|
|
||||||
|
|
||||||
# Use existing process_episode method
|
|
||||||
self.process_episode()
|
|
||||||
posts_processed += 1
|
|
||||||
|
|
||||||
# Get next page of older posts
|
|
||||||
response = requests.get(older_link, headers=self.headers)
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
|
||||||
older_posts = soup.find("span", id="blog-pager-older-link")
|
|
||||||
|
|
||||||
except requests.RequestException as e:
|
|
||||||
print(f"Error fetching older posts: {e}")
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error: {e}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
c2c = C2CScrape()
|
c2c = TorrentScrape()
|
||||||
# Start initial timer immediately
|
c2c.get_episode_info()
|
||||||
c2c.start()
|
|
||||||
# Keep main thread alive with minimal resource usage
|
# Keep main thread alive with minimal resource usage
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue