import re, json, mimetypes, os
from loguru import logger
from vk_url_scraper import VkScraper, DateTimeEncoder
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from configs import VkConfig
class VkArchiver(Archiver):
""""
VK videos are handled by YTDownloader, this archiver gets posts text and images.
Currently only works for /wall posts
"""
name = "vk"
wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")
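    # The patterns above capture ids such as "wall-1234_5678" or "photo1234_5678"
    # (illustrative values); the optional middle character accommodates the "-"
    # that VK uses for group/community owner ids.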

    def __init__(self, storage: Storage, driver, config: VkConfig):
        super().__init__(storage, driver)
        # without credentials the scraper cannot log in, so self.vks stays unset
        if config is not None:
            self.vks = VkScraper(config.username, config.password)

    def download(self, url, check_if_exists=False):
        if not hasattr(self, "vks") or self.vks is None:
            logger.debug("VK archiver was not supplied with credentials.")
            return False
        key = self.get_html_key(url)
        # if check_if_exists and self.storage.exists(key):
        #     screenshot = self.get_screenshot(url)
        #     cdn_url = self.storage.get_cdn_url(key)
        #     return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)
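        # each scrape result is expected (judging by the loop below) to be a
        # dict with at least "id", "datetime", "text" and "payload" keys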
        results = self.vks.scrape(url)  # some urls can contain multiple wall/photo/... parts and all will be fetched
        if len(results) == 0:
            return False
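        # DateTimeEncoder (imported from vk_url_scraper) is presumably a
        # JSONEncoder subclass that serializes the datetime objects appearing
        # in the scraped payload, which json.dumps cannot handle on its own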
        def dump_payload(p): return json.dumps(p, ensure_ascii=False, indent=4, cls=DateTimeEncoder)
textual_output = ""
title, datetime = results[0]["text"], results[0]["datetime"]
urls_found = []
for res in results:
textual_output += f"id: {res['id']}
time utc: {res['datetime']}
text: {res['text']}
payload: {dump_payload(res['payload'])}