diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..c757b87
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,11 @@
+/.git*
+/.editorconfig
+/*.example.*
+/config/
+/log/
+/mongo_data/
+/mysql_data/
+/schedules/
+/Dockerfile
+/docker-compose.yml
+# run.sh must stay in the build context: the image's CMD executes /app/run.sh
diff --git a/.gitignore b/.gitignore
index e1e7224..ca21124 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,8 @@ celerybeat-schedule
 .AppleDouble
 .DS_Store
 .venv
+
+/log/
+/schedules/
+/mongo_data/
+/mysql_data/
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..6f10330
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,39 @@
+FROM python:3.9-slim
+
+ARG DOCKER_USER=dawis
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    neovim \
+    procps \
+    && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Pin pipenv to a release that still bundles pip < 24.1: pip 24.1 validates
+# package metadata more strictly and refuses to install Celery 4.4.7
+# ("Ignoring version 4.4.7 of celery since it has invalid metadata.
+# Please use pip<24.1 if you need to use this version."). pip 24.1 was
+# released 2024-05-06; pipenv 2023.12.1 is the last release bundled with pip < 24.1.
+RUN pip install pipenv==2023.12.1
+
+RUN printf '#!/bin/sh\nls -lah "$@"\n' > /usr/bin/ll && chmod 755 /usr/bin/ll
+
+RUN addgroup --gid 1000 ${DOCKER_USER} && \
+    adduser --uid 1000 --gid 1000 --shell /bin/sh --disabled-password --quiet ${DOCKER_USER}
+
+COPY . /app
+
+WORKDIR /app
+
+RUN pipenv install --system
+
+ENV CELERY_UID=1000
+ENV CELERY_BEAT_SCHEDULEFILE_PATH="/opt/dawis/var/beat-schedules"
+ENV CELERY_LOGFILE_PATH="/opt/dawis/logs"
+ENV CELERY_LOGLEVEL="info"
+ENV CELERY_TIME_LIMIT=600
+ENV CELERY_CONCURRENCY=4
+ENV CELERY_TIMEZONE="Europe/Berlin"
+ENV CELERY_BROKER_URL="redis://redis_host:6379"
+
+CMD ["/app/run.sh"]
diff --git a/README.md b/README.md
index bfcf159..9c4b070 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,38 @@ We distinguish between two levels in the programming logic:
 
 Data processing is based on URLs that are accessed. DAWIS supports so-called URL sets, i.e. a group of URLs that can be handled together. For example, you can create a URL set for important category pages in your shop, or a URL set for various product detail pages. Shared operations can then be performed on these, such as checking whether the canonical tag is still present. DAWIS cannot crawl a website on its own and handle "all URLs"; that is also not the purpose of a monitoring and alerting system.
 
+## Docker Setup
+
+To start Dawis with Docker, the following steps are required:
+
+* `docker build -t dawis .`
+* `docker compose up -d`
+
+The data in MongoDB can be inspected at `http://localhost:4321`, the data in MySQL at `http://localhost:8090`.
+
+The configuration lives in the `config` directory. Logs are written to the `log` directory.
+
+On startup, the file `delete_me_for_restart` is created in the `config` directory. If, for example, a file share is set up for the `config` directory, this makes it easy to restart Dawis after the configuration has changed: delete the file, and Dawis receives a SIGINT signal, finishes its processes in a controlled manner, and starts again.
+
+The resource limits in `docker-compose.yml` can be raised or tightened as needed, depending on your performance requirements and the capacity of the server, as shown in the example below.
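+For example, to give the MongoDB container more headroom, override its limits in `docker-compose.yml` (the values below are illustrative, not a sizing recommendation):
+
+```yaml
+services:
+  dawis_mongo:
+    deploy:
+      resources:
+        limits:
+          cpus: "2"
+          memory: 2G
+```
+
+If you raise the MongoDB memory limit, also raise `--wiredTigerCacheSizeGB` in the service's `command` to match.
+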
+### Debugging
+
+For debugging, run: `docker compose --profile debug up dawis_debug`. Debug output is written to the `log-debug` directory.
+
 ## Next Steps
 
 What comes next? Following the lean principle, we continue to collect feedback. In parallel, we are expanding the functionality of the existing modules. The focus is on these areas:
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..7dc8abb
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,135 @@
+services:
+
+  dawis:
+    container_name: dawis
+    image: dawis
+    environment:
+      CELERY_LOGLEVEL: info
+      REDIS_HOST: dawis_redis
+      CELERY_BROKER_URL: redis://dawis_redis:6379
+    volumes:
+      - ./config:/app/config
+      - ./log:/opt/dawis/logs
+      - ./schedules:/opt/dawis/var/beat-schedules
+    networks:
+      - dawis
+    depends_on:
+      - dawis_mongo
+      - dawis_mysql
+      - dawis_redis
+    # If the marker file is missing or worker.log has not been written for
+    # more than 300 seconds, the check kills PID 1, stopping the container so
+    # that the restart policy recreates it.
+    healthcheck:
+      test: "(ls config/delete_me_for_restart && test $$(stat -c %Y -- /opt/dawis/logs/worker.log) -gt $$(($$(date +%s) - 300))) || kill 1"
+      interval: 5s
+      timeout: 10s
+      retries: 5
+      start_period: 10s
+    deploy:
+      resources:
+        limits:
+          cpus: "1"
+          memory: 1G
+    restart: unless-stopped
+
+  dawis_debug:
+    container_name: dawis_debug
+    image: dawis
+    environment:
+      CELERY_LOGLEVEL: debug
+      REDIS_HOST: dawis_redis
+      CELERY_BROKER_URL: redis://dawis_redis:6379
+    command: ["/app/run-debug.sh"]
+    volumes:
+      - ./config:/app/config
+      - ./log-debug:/opt/dawis/logs
+    networks:
+      - dawis
+    depends_on:
+      - dawis_mongo
+      - dawis_mysql
+      - dawis_redis
+    profiles:
+      - debug
+
+  dawis_mongo:
+    container_name: dawis_mongo
+    image: mongo:4.2
+    environment:
+      MONGO_INITDB_ROOT_USERNAME: root
+      MONGO_INITDB_ROOT_PASSWORD: dawis
+    volumes:
+      - ./mongo_data:/data/db
+    networks:
+      - dawis
+    deploy:
+      resources:
+        limits:
+          cpus: "1"
+          memory: 1G
+    # Keep the WiredTiger cache in line with the memory limit above.
+    command: --wiredTigerCacheSizeGB 1
+    restart: unless-stopped
+
+  dawis_mysql:
+    container_name: dawis_mysql
+    image: mariadb:10.6
+    environment:
+      MARIADB_ROOT_PASSWORD: dawis
+    volumes:
+      - ./mysql_data:/var/lib/mysql
+    networks:
+      - dawis
+    restart: unless-stopped
+
+  dawis_mongo_gui:
+    container_name: dawis_mongo_gui
+    # You need to build this image yourself from
+    # https://github.com/arunbandari/mongo-gui; check out commit
+    # 82550d257650c72deac8a19007a9da83ca685423 if you don't want the OpenAI
+    # support. The image on https://hub.docker.com/r/ugleiton/mongo-gui has a
+    # bug: https://github.com/arunbandari/mongo-gui/issues/68
+    image: mongo-gui
+    ports:
+      - "4321:4321"
+    environment:
+      - MONGO_URL=mongodb://root:dawis@dawis_mongo:27017
+    networks:
+      - dawis
+    restart: unless-stopped
+
+  dawis_mysql_gui:
+    container_name: dawis_mysql_gui
+    image: phpmyadmin
+    ports:
+      - "8090:80"
+    environment:
+      - PMA_HOST=dawis_mysql
+      - PMA_USER=root
+      - PMA_PASSWORD=dawis
+    networks:
+      - dawis
+    restart: unless-stopped
+
+  dawis_redis:
+    container_name: dawis_redis
+    image: redis:7.2
+    networks:
+      - dawis
+    deploy:
+      resources:
+        limits:
+          cpus: "1"
+          memory: 1G
+    command: ["redis-server", "--appendonly", "no", "--maxmemory", "1gb", "--maxmemory-policy", "allkeys-lru"]
+    restart: unless-stopped
+
+networks:
+  dawis:
+    name: dawis
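+
+# To force a clean restart of the dawis service after a configuration change,
+# delete the marker file and let the healthcheck above do the rest:
+#
+#   rm config/delete_me_for_restart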
diff --git a/pagespeed_migration.py b/pagespeed_migration.py
new file mode 100644
index 0000000..3aa4ede
--- /dev/null
+++ b/pagespeed_migration.py
@@ -0,0 +1,123 @@
+from google.cloud.bigquery.client import Client
+from google.cloud.bigquery.dataset import DatasetReference
+from google.cloud.bigquery.enums import SqlTypeNames
+from google.cloud.bigquery.job import LoadJobConfig, WriteDisposition
+from google.cloud.bigquery.schema import SchemaField
+from google.cloud.bigquery.table import TimePartitioning, TimePartitioningType
+from google.oauth2 import service_account
+from os.path import abspath
+from typing import Sequence
+
+
+def create_client(project: str, credentials_file: str) -> Client:
+    credentials = service_account.Credentials.from_service_account_file(abspath(credentials_file))
+    return Client(project, credentials)
+
+
+def run_query(query: str, result_function: callable):
+    # Stream the result page by page so the whole source table never has to
+    # be held in memory at once.
+    query_job = bigquery_client.query(query)
+    row_iterator = query_job.result(page_size=25000)
+
+    for page in row_iterator.pages:
+        result_data = []
+
+        for row in page:
+            result_item = {}
+
+            for column, value in row.items():
+                result_item[column] = value
+
+            result_data.append(result_item)
+
+        result_function(result_data)
+
+
+def upload_pagespeed_data(data: Sequence[dict]):
+    job_config = LoadJobConfig()
+    job_config.write_disposition = WriteDisposition.WRITE_APPEND
+    # Partition the target table by day on the "date" column.
+    job_config.time_partitioning = TimePartitioning(type_=TimePartitioningType.DAY, field='date')
+
+    loading_experience_schema_fields = (
+        SchemaField('cls', SqlTypeNames.INTEGER),
+        SchemaField('clsGood', SqlTypeNames.FLOAT),
+        SchemaField('clsMedium', SqlTypeNames.FLOAT),
+        SchemaField('clsBad', SqlTypeNames.FLOAT),
+        SchemaField('lcp', SqlTypeNames.INTEGER),
+        SchemaField('lcpGood', SqlTypeNames.FLOAT),
+        SchemaField('lcpMedium', SqlTypeNames.FLOAT),
+        SchemaField('lcpBad', SqlTypeNames.FLOAT),
+        SchemaField('fcp', SqlTypeNames.INTEGER),
+        SchemaField('fcpGood', SqlTypeNames.FLOAT),
+        SchemaField('fcpMedium', SqlTypeNames.FLOAT),
+        SchemaField('fcpBad', SqlTypeNames.FLOAT),
+        SchemaField('fid', SqlTypeNames.INTEGER),
+        SchemaField('fidGood', SqlTypeNames.FLOAT),
+        SchemaField('fidMedium', SqlTypeNames.FLOAT),
+        SchemaField('fidBad', SqlTypeNames.FLOAT),
+        SchemaField('inp', SqlTypeNames.FLOAT),
+        SchemaField('inpGood', SqlTypeNames.FLOAT),
+        SchemaField('inpMedium', SqlTypeNames.FLOAT),
+        SchemaField('inpBad', SqlTypeNames.FLOAT),
+    )
+
+    job_config.schema = (
+        SchemaField('url', SqlTypeNames.STRING, 'REQUIRED'),
+        SchemaField('strategy', SqlTypeNames.STRING, 'REQUIRED'),
+        SchemaField('date', SqlTypeNames.DATETIME, 'REQUIRED'),
+        SchemaField('statusCode', SqlTypeNames.INTEGER),
+        SchemaField('cluster', SqlTypeNames.STRING, 'REQUIRED'),
+        SchemaField('labdata', SqlTypeNames.RECORD, 'REQUIRED', fields=(
+            SchemaField('cls', SqlTypeNames.FLOAT),
+            SchemaField('lcp', SqlTypeNames.FLOAT),
+            SchemaField('fcp', SqlTypeNames.FLOAT),
+            SchemaField('tbt', SqlTypeNames.FLOAT),
+            SchemaField('mpfid', SqlTypeNames.FLOAT),
+            SchemaField('ttfb', SqlTypeNames.FLOAT),
+            SchemaField('performanceScore', SqlTypeNames.FLOAT),
+            SchemaField('serverResponseTime', SqlTypeNames.FLOAT),
+            SchemaField('usesTextCompression', SqlTypeNames.FLOAT),
+            SchemaField('usesLongCacheTtl', SqlTypeNames.FLOAT),
+            SchemaField('domSize', SqlTypeNames.FLOAT),
+            SchemaField('offscreenImages', SqlTypeNames.FLOAT),
+            SchemaField('usesOptimizedImages', SqlTypeNames.FLOAT),
+            SchemaField('usesResponsiveImages', SqlTypeNames.FLOAT),
+            SchemaField('renderBlockingResources', SqlTypeNames.FLOAT),
+            SchemaField('bootupTime', SqlTypeNames.FLOAT),
+            SchemaField('mainthreadWorkBreakdown', SqlTypeNames.FLOAT),
+        )),
+        SchemaField(
+            'originLoadingExperience',
+            SqlTypeNames.RECORD,
+            'REQUIRED',
+            fields=loading_experience_schema_fields
+        ),
+        SchemaField('loadingExperience', SqlTypeNames.RECORD, fields=loading_experience_schema_fields)
+    )
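+
+    # load_table_from_json() serializes rows with the standard json encoder,
+    # which cannot handle datetime objects, so the DATETIME column is
+    # converted to an ISO-8601 string first.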
+    for data_item in data:
+        data_item['date'] = data_item['date'].strftime('%Y-%m-%dT%H:%M:%S.%f')
+
+    load_job = bigquery_client.load_table_from_json(data, target_table_reference, job_config=job_config)
+    load_job.result()
+
+
+# Adjust these identifiers and the credentials path to your environment.
+project = 'project-id'
+dataset = 'yourdataset'
+source_table = 'old_pagespeed_table'
+target_table = 'new_pagespeed_table'
+
+bigquery_client = create_client(project, './your-credentials-file-path.json')
+
+target_table_reference = DatasetReference(
+    project,
+    dataset
+).table(target_table)
+
+run_query('SELECT * FROM `{dataset}.{table}`'.format(dataset=dataset, table=source_table), upload_pagespeed_data)
diff --git a/run-debug.sh b/run-debug.sh
new file mode 100755
index 0000000..b722c38
--- /dev/null
+++ b/run-debug.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+/usr/local/bin/python3 -u module-debugger.py
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..3eaa558
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+usermod -u ${CELERY_UID} dawis
+
+touch config/delete_me_for_restart
+# Give everything except the bind-mounted config directory to the dawis user.
+chown dawis:dawis . && chown -R dawis:dawis $(ls -I config)
+
+# Split the cgroup memory limit evenly across the workers; Celery expects the
+# value in kilobytes. This assumes a memory limit is set on the container;
+# /sys/fs/cgroup/memory.max contains the string "max" otherwise.
+CELERY_MAX_MEMORY_PER_CHILD=$(($(cat /sys/fs/cgroup/memory.max)/$CELERY_CONCURRENCY/1000))
+echo "Running workers with $CELERY_MAX_MEMORY_PER_CHILD kB max memory."
+
+/usr/local/bin/python3 -m celery \
+    -A dawis \
+    worker \
+    --uid=${CELERY_UID} \
+    --time-limit=${CELERY_TIME_LIMIT} \
+    --concurrency=${CELERY_CONCURRENCY} \
+    --autoscale=${CELERY_CONCURRENCY},1 \
+    --max-memory-per-child=${CELERY_MAX_MEMORY_PER_CHILD} \
+    --logfile=${CELERY_LOGFILE_PATH}/worker.log \
+    --loglevel=${CELERY_LOGLEVEL} &
+
+workerPid=$!
+echo "Started worker with pid $workerPid."
+
+/usr/local/bin/python3 -m celery \
+    -A dawis \
+    beat \
+    --uid=${CELERY_UID} \
+    --logfile=${CELERY_LOGFILE_PATH}/beat.log \
+    --schedule=/opt/dawis/var/beat-schedules/schedule.db \
+    --max-interval=60 \
+    --loglevel=${CELERY_LOGLEVEL} &
+
+beatPid=$!
+echo "Started scheduler with pid $beatPid."
+
+# On SIGTERM (docker stop or the healthcheck's `kill 1`), forward SIGINT to
+# beat and the worker, wait for the worker to exit, then quit.
+trap "echo Stopping... && kill -2 $beatPid && kill -2 $workerPid && tail --pid=$workerPid -f /dev/null && exit" SIGTERM
+
+while true; do sleep 1; done
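+
+# Note: this script, not Celery, is PID 1, which is why the healthcheck's
+# `kill 1` reaches it. Bash delivers the trapped SIGTERM only after the
+# current `sleep` returns, so shutdown can lag by up to a second. Celery
+# treats the forwarded SIGINT like Ctrl-C, initiating a warm shutdown so
+# running tasks can finish first.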