mirror of
https://github.com/Death916/c2cscrape.git
synced 2026-04-10 03:04:40 -07:00
re format
This commit is contained in:
parent
4624b927af
commit
c3f45dae6b
7 changed files with 81 additions and 18 deletions
7
.gitignore
vendored
7
.gitignore
vendored
|
|
@ -83,4 +83,9 @@ poetry.lock
|
|||
# XML generated files
|
||||
podcast_feed.xml
|
||||
feed.xml
|
||||
video_feed.xml
|
||||
video_feed.xml
|
||||
uv.lock
|
||||
*.svg
|
||||
|
||||
old.pyproject.toml
|
||||
.github
|
||||
|
|
|
|||
14
docker/docker-compose.yml
Normal file
14
docker/docker-compose.yml
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
version: '3.8'
|
||||
|
||||
services:
|
||||
c2c-scraper:
|
||||
image: death916/c2cscrape:latest
|
||||
volumes:
|
||||
- /mnt/media/media/books/audio/podcasts/C2C:/downloads
|
||||
- /mnt/media/docker/volumes/c2cscrape
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- TZ=America/Los_Angeles
|
||||
|
||||
|
||||
|
||||
22
docker/dockerfile
Normal file
22
docker/dockerfile
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
FROM python:3.11-slim
|
||||
|
||||
# Set timezone
|
||||
ENV TZ=America/Los_Angeles
|
||||
RUN apt-get update && apt-get install -y tzdata
|
||||
|
||||
# Create app directory
|
||||
WORKDIR /app
|
||||
|
||||
# Install dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Copy source code
|
||||
COPY . .
|
||||
|
||||
# Set hardcoded download path
|
||||
ENV DOWNLOAD_DIR=/downloads
|
||||
|
||||
# Start the scraper
|
||||
CMD ["python", "-u", "c2cscrape.py"]
|
||||
|
||||
|
|
@ -1,18 +1,17 @@
|
|||
[tool.poetry]
|
||||
[project]
|
||||
name = "c2cscrape"
|
||||
version = "0.1.0"
|
||||
description = "scrape blog site for old c2c am episdodes and serve as rss"
|
||||
authors = ["Death916 <mail@trentnelson.dev>"]
|
||||
readme = "README.md"
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.11"
|
||||
playwright = "^1.50.0"
|
||||
bs4 = "^0.0.2"
|
||||
requests = "^2.32.3"
|
||||
feedgen = "^1.0.0"
|
||||
|
||||
authors = [{ name = "Death916", email = "mail@trentnelson.dev" }]
|
||||
source = "src"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"bs4>=0.0.2",
|
||||
"feedgen>=1.0.0",
|
||||
"playwright>=1.50.0",
|
||||
"requests>=2.32.3",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ class C2CScrape:
|
|||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
||||
}
|
||||
self.episodes_downloaded = 0
|
||||
self.download_location = '/downloads'
|
||||
def sanitize_filename(self, filename):
|
||||
# Remove or replace invalid filename characters
|
||||
return re.sub(r'[<>:"/\\|?*]', '-', filename)
|
||||
|
|
@ -78,7 +79,7 @@ class C2CScrape:
|
|||
return
|
||||
|
||||
# Create downloads directory if it doesn't exist
|
||||
download_dir = 'downloads'
|
||||
download_dir = '/downloads'
|
||||
os.makedirs(download_dir, exist_ok=True)
|
||||
|
||||
# Get current date
|
||||
|
|
@ -154,7 +155,17 @@ class C2CScrape:
|
|||
|
||||
|
||||
# timer to check for new episodes every 12 hours
|
||||
def timer(self):
|
||||
|
||||
try:
|
||||
# Run our core operations
|
||||
self.process_episode()
|
||||
self.get_older_posts()
|
||||
print(f'Episodes downloaded: {self.episodes_downloaded}')
|
||||
finally:
|
||||
# Ensure timer restarts even if there's an error
|
||||
print("waiting 12 hours")
|
||||
threading.Timer(43200, self.timer).start() # 43200 sec = 12 hours
|
||||
# navigate to older posts button 5 times and get last 5 episodes with no repeats/ span id is blog-pager-older-link
|
||||
def get_older_posts(self, limit=5):
|
||||
try:
|
||||
|
|
@ -194,9 +205,15 @@ class C2CScrape:
|
|||
|
||||
if __name__ == '__main__':
|
||||
c2c = C2CScrape()
|
||||
c2c.process_episode()
|
||||
c2c.get_older_posts()
|
||||
print(f'Episodes downloaded: {c2c.episodes_downloaded}')
|
||||
# Start initial timer immediately
|
||||
c2c.timer()
|
||||
# Keep main thread alive with minimal resource usage
|
||||
try:
|
||||
while True:
|
||||
time.sleep(3600) # Check once per hour
|
||||
except KeyboardInterrupt:
|
||||
print("\nStopping scheduled downloads...")
|
||||
print(f'Episodes downloaded: {c2c.episodes_downloaded}')
|
||||
|
||||
#rss = createRss()
|
||||
#rss.process_episodes()
|
||||
6
src/main.py
Normal file
6
src/main.py
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
def main():
|
||||
print("Hello from c2cscrape!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue