re format

This commit is contained in:
Death916 2025-02-20 05:28:09 -08:00
parent 4624b927af
commit c3f45dae6b
7 changed files with 81 additions and 18 deletions

7
.gitignore vendored
View file

@ -83,4 +83,9 @@ poetry.lock
# XML generated files
podcast_feed.xml
feed.xml
video_feed.xml
video_feed.xml
uv.lock
*.svg
old.pyproject.toml
.github

14
docker/docker-compose.yml Normal file
View file

@ -0,0 +1,14 @@
# Compose service definition that runs the published c2cscrape image.
version: '3.8'
services:
c2c-scraper:
image: death916/c2cscrape:latest
volumes:
# Host directory on the left, mounted at /downloads inside the container.
- /mnt/media/media/books/audio/podcasts/C2C:/downloads
# NOTE(review): this entry has no ":<container path>" part, so Compose reads
# it as a container path backed by an anonymous volume rather than as a host
# bind mount - confirm the intended target.
- /mnt/media/docker/volumes/c2cscrape
# Restart the container automatically unless it was explicitly stopped.
restart: unless-stopped
environment:
# Timezone passed to the container.
- TZ=America/Los_Angeles

22
docker/dockerfile Normal file
View file

@ -0,0 +1,22 @@
FROM python:3.11-slim
# Set timezone
ENV TZ=America/Los_Angeles
# Install tzdata so the TZ name above can be resolved.
# --no-install-recommends and removing the apt lists in the same layer keep
# package-index files and optional packages out of the final image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends tzdata \
    && rm -rf /var/lib/apt/lists/*
# Create app directory
WORKDIR /app
# Install dependencies before copying the source so this layer stays cached
# until requirements.txt changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy source code
COPY . .
# Set hardcoded download path
ENV DOWNLOAD_DIR=/downloads
# Start the scraper; -u runs Python with unbuffered stdout/stderr
CMD ["python", "-u", "c2cscrape.py"]

View file

@ -1,18 +1,17 @@
[tool.poetry]
[project]
name = "c2cscrape"
version = "0.1.0"
description = "scrape blog site for old c2c am episodes and serve as rss"
authors = ["Death916 <mail@trentnelson.dev>"]
readme = "README.md"
[tool.poetry.dependencies]
python = "^3.11"
playwright = "^1.50.0"
bs4 = "^0.0.2"
requests = "^2.32.3"
feedgen = "^1.0.0"
authors = [{ name = "Death916", email = "mail@trentnelson.dev" }]
source = "src"
requires-python = ">=3.11"
dependencies = [
"bs4>=0.0.2",
"feedgen>=1.0.0",
"playwright>=1.50.0",
"requests>=2.32.3",
]
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
requires = ["hatchling"]
build-backend = "hatchling.build"

View file

@ -17,6 +17,7 @@ class C2CScrape:
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
self.episodes_downloaded = 0
self.download_location = '/downloads'
def sanitize_filename(self, filename):
    """Return *filename* with characters that are invalid in file names replaced by '-'.

    Replaces the reserved punctuation characters (angle brackets, colon,
    double quote, forward slash, backslash, pipe, question mark, asterisk)
    and also ASCII control characters (0x00-0x1F), so that newlines or tabs
    embedded in a scraped title cannot end up in a file name.

    Args:
        filename: The candidate file name (a str).

    Returns:
        The sanitized name; each offending character becomes a single '-'.
    """
    # Remove or replace invalid filename characters
    return re.sub(r'[<>:"/\\|?*\x00-\x1f]', '-', filename)
@ -78,7 +79,7 @@ class C2CScrape:
return
# Create downloads directory if it doesn't exist
download_dir = 'downloads'
download_dir = '/downloads'
os.makedirs(download_dir, exist_ok=True)
# Get current date
@ -154,7 +155,17 @@ class C2CScrape:
# timer to check for new episodes every 12 hours
def timer(self):
    """Run one scrape pass, then schedule the next pass 12 hours later.

    Calls process_episode() and get_older_posts(), prints the running
    download count, and always arms a new threading.Timer that calls this
    method again after 43200 seconds.

    An exception raised by the pass is reported and not re-raised, so a
    failed run cannot propagate into the caller (on the first call that is
    the main thread) and the schedule keeps going.
    """
    try:
        # Run our core operations
        self.process_episode()
        self.get_older_posts()
        print(f'Episodes downloaded: {self.episodes_downloaded}')
    except Exception as exc:
        # Report and continue: one failed pass must not take down the caller.
        print(f'Scheduled run failed: {exc!r}')
    finally:
        # Ensure timer restarts even if there's an error
        print("waiting 12 hours")
        next_run = threading.Timer(43200, self.timer)  # 43200 sec = 12 hours
        # Daemon thread: a pending 12-hour wait must not keep the process
        # alive once the main thread has decided to exit.
        next_run.daemon = True
        next_run.start()
# Navigate to the "older posts" button (span id: blog-pager-older-link) 5 times and get the last 5 episodes with no repeats.
def get_older_posts(self, limit=5):
try:
@ -194,9 +205,15 @@ class C2CScrape:
if __name__ == '__main__':
c2c = C2CScrape()
c2c.process_episode()
c2c.get_older_posts()
print(f'Episodes downloaded: {c2c.episodes_downloaded}')
# Start initial timer immediately
c2c.timer()
# Keep main thread alive with minimal resource usage
try:
while True:
time.sleep(3600) # Check once per hour
except KeyboardInterrupt:
print("\nStopping scheduled downloads...")
print(f'Episodes downloaded: {c2c.episodes_downloaded}')
#rss = createRss()
#rss.process_episodes()

6
src/main.py Normal file
View file

@ -0,0 +1,6 @@
def main() -> None:
    """Print a fixed greeting to standard output."""
    print("Hello from c2cscrape!")


# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

View file