mirror of
https://github.com/Death916/c2cscrape.git
synced 2026-04-10 03:04:40 -07:00
re format
This commit is contained in:
parent
4624b927af
commit
c3f45dae6b
7 changed files with 81 additions and 18 deletions
7
.gitignore
vendored
7
.gitignore
vendored
|
|
@ -83,4 +83,9 @@ poetry.lock
|
||||||
# XML generated files
|
# XML generated files
|
||||||
podcast_feed.xml
|
podcast_feed.xml
|
||||||
feed.xml
|
feed.xml
|
||||||
video_feed.xml
|
video_feed.xml
|
||||||
|
uv.lock
|
||||||
|
*.svg
|
||||||
|
|
||||||
|
old.pyproject.toml
|
||||||
|
.github
|
||||||
|
|
|
||||||
14
docker/docker-compose.yml
Normal file
14
docker/docker-compose.yml
Normal file
|
|
@ -0,0 +1,14 @@
|
||||||
|
version: '3.8'
|
||||||
|
|
||||||
|
services:
|
||||||
|
c2c-scraper:
|
||||||
|
image: death916/c2cscrape:latest
|
||||||
|
volumes:
|
||||||
|
- /mnt/media/media/books/audio/podcasts/C2C:/downloads
|
||||||
|
- /mnt/media/docker/volumes/c2cscrape
|
||||||
|
restart: unless-stopped
|
||||||
|
environment:
|
||||||
|
- TZ=America/Los_Angeles
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
22
docker/dockerfile
Normal file
22
docker/dockerfile
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
FROM python:3.11-slim
|
||||||
|
|
||||||
|
# Set timezone
|
||||||
|
ENV TZ=America/Los_Angeles
|
||||||
|
RUN apt-get update && apt-get install -y tzdata
|
||||||
|
|
||||||
|
# Create app directory
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Install dependencies
|
||||||
|
COPY requirements.txt .
|
||||||
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
|
# Copy source code
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
# Set hardcoded download path
|
||||||
|
ENV DOWNLOAD_DIR=/downloads
|
||||||
|
|
||||||
|
# Start the scraper
|
||||||
|
CMD ["python", "-u", "c2cscrape.py"]
|
||||||
|
|
||||||
|
|
@ -1,18 +1,17 @@
|
||||||
[tool.poetry]
|
[project]
|
||||||
name = "c2cscrape"
|
name = "c2cscrape"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
description = "scrape blog site for old c2c am episodes and serve as rss"
|
description = "scrape blog site for old c2c am episodes and serve as rss"
|
||||||
authors = ["Death916 <mail@trentnelson.dev>"]
|
authors = [{ name = "Death916", email = "mail@trentnelson.dev" }]
|
||||||
readme = "README.md"
|
source = "src"
|
||||||
|
requires-python = ">=3.11"
|
||||||
[tool.poetry.dependencies]
|
dependencies = [
|
||||||
python = "^3.11"
|
"bs4>=0.0.2",
|
||||||
playwright = "^1.50.0"
|
"feedgen>=1.0.0",
|
||||||
bs4 = "^0.0.2"
|
"playwright>=1.50.0",
|
||||||
requests = "^2.32.3"
|
"requests>=2.32.3",
|
||||||
feedgen = "^1.0.0"
|
]
|
||||||
|
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["poetry-core"]
|
requires = ["hatchling"]
|
||||||
build-backend = "poetry.core.masonry.api"
|
build-backend = "hatchling.build"
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,7 @@ class C2CScrape:
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
||||||
}
|
}
|
||||||
self.episodes_downloaded = 0
|
self.episodes_downloaded = 0
|
||||||
|
self.download_location = '/downloads'
|
||||||
def sanitize_filename(self, filename):
|
def sanitize_filename(self, filename):
|
||||||
# Remove or replace invalid filename characters
|
# Remove or replace invalid filename characters
|
||||||
return re.sub(r'[<>:"/\\|?*]', '-', filename)
|
return re.sub(r'[<>:"/\\|?*]', '-', filename)
|
||||||
|
|
@ -78,7 +79,7 @@ class C2CScrape:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Create downloads directory if it doesn't exist
|
# Create downloads directory if it doesn't exist
|
||||||
download_dir = 'downloads'
|
download_dir = '/downloads'
|
||||||
os.makedirs(download_dir, exist_ok=True)
|
os.makedirs(download_dir, exist_ok=True)
|
||||||
|
|
||||||
# Get current date
|
# Get current date
|
||||||
|
|
@ -154,7 +155,17 @@ class C2CScrape:
|
||||||
|
|
||||||
|
|
||||||
# timer to check for new episodes every 12 hours
|
# timer to check for new episodes every 12 hours
|
||||||
|
def timer(self):
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Run our core operations
|
||||||
|
self.process_episode()
|
||||||
|
self.get_older_posts()
|
||||||
|
print(f'Episodes downloaded: {self.episodes_downloaded}')
|
||||||
|
finally:
|
||||||
|
# Ensure timer restarts even if there's an error
|
||||||
|
print("waiting 12 hours")
|
||||||
|
threading.Timer(43200, self.timer).start() # 43200 sec = 12 hours
|
||||||
# click the "older posts" link (span id: blog-pager-older-link) 5 times and get the last 5 episodes with no repeats
|
# click the "older posts" link (span id: blog-pager-older-link) 5 times and get the last 5 episodes with no repeats
|
||||||
def get_older_posts(self, limit=5):
|
def get_older_posts(self, limit=5):
|
||||||
try:
|
try:
|
||||||
|
|
@ -194,9 +205,15 @@ class C2CScrape:
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
c2c = C2CScrape()
|
c2c = C2CScrape()
|
||||||
c2c.process_episode()
|
# Start initial timer immediately
|
||||||
c2c.get_older_posts()
|
|
||||||
print(f'Episodes downloaded: {c2c.episodes_downloaded}')
|
|
||||||
c2c.timer()
|
c2c.timer()
|
||||||
|
# Keep main thread alive with minimal resource usage
|
||||||
|
try:
|
||||||
|
while True:
|
||||||
|
time.sleep(3600) # Check once per hour
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
print("\nStopping scheduled downloads...")
|
||||||
|
print(f'Episodes downloaded: {c2c.episodes_downloaded}')
|
||||||
|
|
||||||
#rss = createRss()
|
#rss = createRss()
|
||||||
#rss.process_episodes()
|
#rss.process_episodes()
|
||||||
6
src/main.py
Normal file
6
src/main.py
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
def main():
|
||||||
|
print("Hello from c2cscrape!")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Loading…
Add table
Add a link
Reference in a new issue