From 247e6732bfd72f7ff1308fd281bab0afb8fe29d1 Mon Sep 17 00:00:00 2001 From: Victor Westerlund Date: Wed, 2 Mar 2022 04:06:13 +0100 Subject: [PATCH] pre-v1.0 (#1) * wip(22w8a): add wip db and gcs client * wip(22w8b): bootstrapping fix * wip(22w8c): add first-run sql config * wip(22w8d): add sqlite abstraction * wip(22w8e): add filesystem handler * wip(22w8f): add folder walker * wip(22w8a): finish db writer * wip(22w9a): add item zipper * wip(22w9b): add gcs upload * Create README.md Co-authored-by: Cloud Shell --- .env.example | 7 +++++ .gitignore | 52 ++++++++++++++++++++++++++++++++ README.md | 43 +++++++++++++++++++++++++++ backup.py | 5 ++++ requirements.txt | 2 ++ src/__init__.py | 9 ++++++ src/backup.py | 64 +++++++++++++++++++++++++++++++++++++++ src/cloud/__init__.py | 30 +++++++++++++++++++ src/cloud/gcs.py | 26 ++++++++++++++++ src/db/__init__.py | 2 ++ src/db/config.sql | 15 ++++++++++ src/db/database.py | 53 +++++++++++++++++++++++++++++++++ src/db/flags.py | 16 ++++++++++ src/db/sqlite.py | 69 +++++++++++++++++++++++++++++++++++++++++++ src/fs/__init__.py | 2 ++ src/fs/fs.py | 58 ++++++++++++++++++++++++++++++++++++ src/fs/utils.py | 15 ++++++++++ 17 files changed, 468 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 README.md create mode 100644 backup.py create mode 100644 requirements.txt create mode 100644 src/__init__.py create mode 100644 src/backup.py create mode 100644 src/cloud/__init__.py create mode 100644 src/cloud/gcs.py create mode 100644 src/db/__init__.py create mode 100644 src/db/config.sql create mode 100644 src/db/database.py create mode 100644 src/db/flags.py create mode 100644 src/db/sqlite.py create mode 100644 src/fs/__init__.py create mode 100644 src/fs/fs.py create mode 100644 src/fs/utils.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..9ba18a7 --- /dev/null +++ b/.env.example @@ -0,0 +1,7 @@ +SOURCE_FOLDER= +TARGET_BUCKET= + +# Cloud provider "gcs, 
aws, azure" +SERVICE_NAME= +# Path to service account key file +SERVICE_KEY= \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4f392c7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,52 @@ +# Bootstrapping # +################# +/node_modules +/public/hot +/public/storage +/storage/*.key +/vendor +.env +.env.backup +.phpunit.result.cache +Homestead.json +Homestead.yaml +npm-debug.log +yarn-error.log +public/robots.txt +__pycache__ +*.pyc + +# OS generated files # +###################### +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +Icon? +ehthumbs.db +Thumbs.db +.directory + +# Tool specific files # +####################### +# vim +*~ +*.swp +*.swo +# sublime text & textmate +*.sublime-* +*.stTheme.cache +*.tmlanguage.cache +*.tmPreferences.cache +# Eclipse +.settings/* +# JetBrains, aka PHPStorm, IntelliJ IDEA +.idea/* +# NetBeans +nbproject/* +# Visual Studio Code +.vscode +.theia +# Sass preprocessor +.sass-cache/ \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..4dfdfd4 --- /dev/null +++ b/README.md @@ -0,0 +1,43 @@ +# Cloud Backup +Backup and archive ordinary files and folders to Google Cloud, AWS or Azure. + +## Get started +This program requires Python 3.6 or newer with PIP. + +1. **Clone this repo** +``` +git clone https://github.com/VictorWesterlund/cloud-backup +``` + +2. **Install dependencies** +``` +python3 -m pip install -r requirements.txt +``` + +3. **Copy environment variables file** +``` +cp .env.example .env +``` + +4. **Edit environment variables** +Open `.env` with your text editor of choice and fill out these required variables +```bash +# Path to the local folder to back up +SOURCE_FOLDER= +# Name of the remote bucket (destination) +TARGET_BUCKET= + +# Cloud provider (gcs, s3, azure) +SERVICE_NAME= +# Path to service account key file +SERVICE_KEY= +``` + +5. 
**Run backup script** +``` +python3 backup.py +``` + +Second-level files and folders should now start uploading to your destination bucket as zip archives. +Subsequent runs of the `backup.py` script will only upload changed files and folders. +In-fact; modified state is cached locally and doesn't request anything from your cloud provider. diff --git a/backup.py b/backup.py new file mode 100644 index 0000000..31763ab --- /dev/null +++ b/backup.py @@ -0,0 +1,5 @@ +import sys + +from src import Backup + +Backup().backup_all() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..dd9601f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +python-dotenv +google-cloud-storage \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..efa8b99 --- /dev/null +++ b/src/__init__.py @@ -0,0 +1,9 @@ +from dotenv import load_dotenv +from .db import Database, dbname +from .fs import FileSystem, file_exists +from .backup import Backup + +if not file_exists(".env"): + raise FileNotFoundError("Environment variable file does not exist. Copy '.env.example' to '.env'") + +load_dotenv() \ No newline at end of file diff --git a/src/backup.py b/src/backup.py new file mode 100644 index 0000000..7f625f8 --- /dev/null +++ b/src/backup.py @@ -0,0 +1,64 @@ +from typing import Union + +from .cloud import Storage as StorageClient +from . import Database, FileSystem +from . 
from . import dbname


class Backup(FileSystem):
    """Upload changed entries of the source folder to the cloud provider.

    Combines the folder scanner inherited from ``FileSystem`` with the local
    manifest (``Database``) and the provider facade (``StorageClient``).
    """

    def __init__(self):
        super().__init__()

        # Flipped to True as soon as one item needs uploading
        self.has_change = False

        self.db = Database()
        self.cloud = StorageClient()

        # Items are uploaded as zip archives when the COMPRESS flag is set
        self.compress = self.db.get_flag("COMPRESS")

    def backup_item(self, item: Union[list, tuple, str]) -> bool:
        """Back up a single file or folder if the manifest says it changed.

        Args:
            item: Either a path, or an ``(path, chksum)`` pair as produced by
                ``get_item``.

        Returns:
            True when the item was uploaded; False when it was skipped
            (ignored, unchanged, the manifest database itself) or when the
            upload failed.
        """
        if isinstance(item, str):
            item = self.get_item(item)

        # get_item() yields False for ignored paths; nothing to back up then
        if not item:
            return False

        # Unchanged since the last run according to the manifest
        if not self.db.check_item(item):
            return False

        # The manifest database lives inside the source folder; record its
        # new state silently instead of uploading it
        if item[0].endswith(dbname):
            self.db.set_item(item)
            return False

        self.has_change = True

        print(f"Uploading: '{item[0]}' ... ", end="", flush=True)

        # upload() takes a path: the temp archive when compressing, otherwise
        # the item's own path (not the whole (path, chksum) pair)
        blob = FileSystem.zip(item) if self.compress else item[0]

        try:
            uploaded = self.cloud.upload(blob)
        finally:
            # Remove the temp zip even if the provider client raised
            if self.compress:
                FileSystem.delete(blob)

        if not uploaded:
            print("FAILED")
            return False

        # Update local database
        if self.db.set_item(item):
            print("OK")
        else:
            print("OK, but failed to update database")
        return True

    def backup_all(self):
        """Scan SOURCE_FOLDER and back up every entry directly inside it."""
        for item in self.all():
            self.backup_item(item)

        if not self.has_change:
            print("Up to date. No changes found")


# ---- src/cloud/__init__.py ----
import os
import importlib

# This class initializes only the module for the requested service.
# It sits as an intermediate between the initiator script and client library.
class Storage:
    """Facade that forwards uploads to the client of the selected provider.

    The provider is taken from the SERVICE_NAME environment variable and
    resolved to the module ``src.cloud.<name>``, which must expose a
    ``StorageClient`` class.
    """

    def __init__(self):
        self._service = None
        self.service = os.getenv("SERVICE_NAME")

    @property
    def service(self):
        """The instantiated provider client."""
        return self._service

    # Create a new storage client for the requested service
    @service.setter
    def service(self, service: str):
        # Fall back to Google Cloud Storage when nothing is configured
        name = (service or "gcs").strip().lower()

        # The name becomes part of an import path, so only accept a plain
        # module name (no dots, slashes or whitespace)
        if not name.isidentifier():
            raise ModuleNotFoundError(f"Invalid cloud service name: '{service}'")

        module = importlib.import_module("src.cloud." + name)
        self._service = module.StorageClient()

    @staticmethod
    def get_args(values):
        """Drop the last element of ``values`` in place and return the list.

        An empty list is returned unchanged instead of raising IndexError.
        """
        if values:
            values.pop(-1)
        return values

    def upload(self, *argv):
        """Forward the call to the provider client's ``upload``."""
        return self.service.upload(*argv)


# ---- src/cloud/gcs.py ----
import os


# Client for Google Cloud Storage
class StorageClient:
    """Uploads local files to the bucket named by TARGET_BUCKET."""

    def __init__(self):
        # Imported here so the dependency is only needed when this client
        # is actually constructed
        from google.cloud import storage

        # Only point the Google client at the key file when one is
        # configured; assigning None to os.environ raises TypeError
        key_file = os.getenv("SERVICE_KEY")
        if key_file:
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_file

        client = storage.Client()
        self.bucket = client.bucket(self.get_bucket())

    def get_bucket(self):
        """Return the destination bucket name from the environment."""
        return os.getenv("TARGET_BUCKET")

    def upload(self, path: str) -> bool:
        """Upload the file at ``path`` under its base name.

        Returns:
            True on success, False if opening or uploading the file failed.
        """
        from ..fs.utils import get_file

        name = get_file(path)
        blob = self.bucket.blob(name)

        try:
            with open(path, "rb") as f:
                blob.upload_from_file(f)
        except Exception:
            # Report failure to the caller, but let KeyboardInterrupt and
            # SystemExit propagate
            return False
        return True


# ---- src/db/__init__.py ----
# Re-exports `dbname` (from .sqlite) and `Database` (from .database).

# ---- src/db/config.sql ----
# CREATE TABLE flags (
#     k TEXT PRIMARY KEY,
#     v INTEGER
# );
#
# CREATE TABLE manifest (
#     anchor TEXT PRIMARY KEY,
#     chksum INTEGER
# );
#
# INSERT INTO flags
# VALUES
#     ("COMPRESS", 1),
#     ("BUCKET_OK", 0),
#     ("INIT", 1);
# ---- src/db/sqlite.py ----
# (Placed first in this span: Database and Flags below subclass SQLite.)
import os
import pathlib
import sqlite3 as sqlite
from typing import Union

dbname = "._cloudbackup.db"


class SQLite():
    """Small wrapper around the SQLite database kept in SOURCE_FOLDER."""

    def __init__(self):
        self.db = sqlite.connect(self.get_db_path())
        self.cursor = self.db.cursor()

        # Check if the database requires configuration
        try:
            if not self.get_flag("INIT"):
                self.configure_db()
        except sqlite.OperationalError:
            # Raised by the lookup when the flags table is missing
            self.configure_db()

    # Strip linebreaks from pretty-printed SQL
    @staticmethod
    def format_query(sql: str) -> str:
        """Collapse a multi-line SQL string into one space-separated line."""
        return " ".join(line.strip() for line in sql.splitlines())

    def query(self, sql: str, params: Union[list, tuple] = ()):
        """Execute ``sql``, commit, and return the fetched rows.

        Args:
            sql: Statement to run; may contain ``?`` placeholders.
            params: Values bound to the placeholders. Binding values instead
                of formatting them into the string keeps quotes in the data
                from breaking the statement.

        Returns:
            The list of row tuples, or False when there are no rows.
        """
        query = self.cursor.execute(sql, params)
        self.db.commit()

        result = query.fetchall()
        return result if result else False

    def get_db_path(self) -> str:
        """Return the database file path inside SOURCE_FOLDER.

        Raises:
            ValueError: If SOURCE_FOLDER is unset or empty.
        """
        path = os.getenv("SOURCE_FOLDER")
        if not path:
            raise ValueError("SOURCE_FOLDER environment variable is not set")

        # Append db file name if absent
        if not path.endswith(dbname):
            path = os.path.join(path, dbname)
        return path

    def configure_db(self):
        """Create the tables and default flags from the bundled config.sql."""
        config = pathlib.Path(__file__).parent.resolve() / "config.sql"

        # Context manager so the file handle is closed again
        with open(config, encoding="utf-8") as sql:
            sql_str = SQLite.format_query(sql.read())

        return self.cursor.executescript(sql_str)

    def get_flag(self, key: str) -> bool:
        """Return the boolean value of a flag.

        An environment variable named ``key`` overrides the database; the
        values "0", "false", "no" and "off" (any case) switch the flag off,
        any other non-empty value switches it on. Without an override the
        ``v`` column of the flags table decides, and an unknown key is False.
        """
        # Return environment variable override
        envar = os.getenv(key)
        if envar:
            return envar.strip().lower() not in ("0", "false", "no", "off")

        res = self.query("SELECT v FROM flags WHERE k = ?", (key,))
        if not res:
            return False

        # Use the stored value, not merely the existence of the row
        return bool(res[0][0])


# ---- src/db/database.py ----
class Database(SQLite):
    """Manifest of backed-up items, stored as (anchor, chksum) rows."""

    def __init__(self):
        super().__init__()

        self._columns = ["anchor", "chksum"]

    @property
    def columns(self):
        """Column names as a comma separated string for use in SQL."""
        return ",".join(self._columns)

    @columns.setter
    def columns(self, columns: list):
        self._columns = columns

    # Create SQL string CSV from list
    @staticmethod
    def str_csv(items: Union[list, tuple]) -> str:
        """Return ``items`` as comma separated, single-quoted SQL literals.

        Single quotes inside a value are doubled so the literal stays valid.
        """
        return ",".join("'" + str(value).replace("'", "''") + "'" for value in items)

    # Check if item exists in the database
    def item_exists(self, item: Union[list, tuple]) -> bool:
        """Return True when a manifest row exists for the path ``item[0]``."""
        res = self.query("SELECT anchor FROM manifest WHERE anchor = ?", (item[0],))
        return bool(res)

    def check_item(self, item: Union[list, tuple]) -> bool:
        """Return True when ``item`` is new or differs from its manifest row."""
        sql = f"SELECT {self.columns} FROM manifest WHERE anchor = ?"
        db_item = self.query(sql, (item[0],))

        # Compare as tuples so a list item can equal the stored row
        return not db_item or tuple(item) != tuple(db_item[0])

    # Insert or update item in database
    def set_item(self, item: Union[list, tuple]) -> bool:
        """Store the checksum ``item[1]`` for the path ``item[0]``.

        Returns:
            True when the row was written, False if SQLite reported an error.
        """
        # anchor is the primary key, so REPLACE covers both insert and update
        sql = "INSERT OR REPLACE INTO manifest (anchor, chksum) VALUES (?, ?)"
        try:
            self.query(sql, (item[0], item[1]))
        except sqlite.Error:
            return False
        return True


# ---- src/db/flags.py ----
class Flags(SQLite):
    """Column-name holder for the flags table (k, v)."""

    def __init__(self):
        super().__init__()

        self._columns = ["k", "v"]

    @property
    def columns(self):
        """Column names as a comma separated string for use in SQL."""
        return ",".join(self._columns)

    @columns.setter
    def columns(self, columns: list):
        self._columns = columns


# ---- src/fs/__init__.py ----
# Re-exports `file_exists`, `get_parent`, `get_file` (from .utils) and
# `FileSystem` (from .fs).
newline at end of file diff --git a/src/fs/fs.py b/src/fs/fs.py new file mode 100644 index 0000000..00314d9 --- /dev/null +++ b/src/fs/fs.py @@ -0,0 +1,58 @@ +import os +import zlib +import shutil +import tempfile + +from ..db import dbname +from .utils import file_exists, get_parent, get_file + +class FileSystem: + def __init__(self): + self.path = FileSystem.get_path() + + @staticmethod + def get_path() -> str: + return os.getenv("SOURCE_FOLDER") + + # Calculate a CRC32 checksum of provided data + @staticmethod + def chksum(data: str) -> str: + encoded = data.encode("utf-8") + return zlib.crc32(encoded) + + @staticmethod + def delete(path: str) -> bool: + return os.remove(path) + + @staticmethod + def zip(item) -> str: + dest = f"{tempfile.gettempdir()}/{str(item[1])}" + + # Make a temp zip file of single file or folder + if file_exists(item[0]): + return shutil.make_archive(dest, "zip", get_parent(item[0]), get_file(item[0])) + return shutil.make_archive(dest, "zip", item[0]) + + # Get metadata from candidate file or folder + def get_item(self, path: str) -> tuple: + # Ignore SQLite temp files + if path.endswith(".db-journal"): + return False + + mtime = os.path.getmtime(path) + chksum = FileSystem.chksum(path + str(mtime)) + + data = (path, chksum) + return data + + # Get all second-level files and folders for path + def all(self) -> list: + content = [os.path.join(self.path, f) for f in os.listdir(self.path)] + items = [] + + for item in content: + data = self.get_item(item) + if data: + items.append(data) + + return items \ No newline at end of file diff --git a/src/fs/utils.py b/src/fs/utils.py new file mode 100644 index 0000000..62d6531 --- /dev/null +++ b/src/fs/utils.py @@ -0,0 +1,15 @@ +import os.path +import ntpath + +# Check if a file exists +def file_exists(file: str) -> bool: + return os.path.isfile(file) + +# Get parent directory of file +def get_parent(path: str) -> str: + return os.path.dirname(path) + +# Get filename from path string +def 
get_file(path: str) -> str: + head, tail = ntpath.split(path) + return tail or ntpath.basename(head) \ No newline at end of file