diff --git a/.github/workflows/build_docker.yml b/.github/workflows/build_docker.yml
index 0cf2441838..f99831da32 100644
--- a/.github/workflows/build_docker.yml
+++ b/.github/workflows/build_docker.yml
@@ -3,51 +3,197 @@
 on:
   push:
     branches:
       - main
+      - dev
   pull_request:
-    branches:
-      - main
   release:
     types:
       - published
+  workflow_dispatch:
 
 permissions: {}
 
 jobs:
-  build:
+  test-e2e:
+    name: End-to-end test
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+        id: setup-buildx
+
+      - name: Build database container
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: ./docker/database/Dockerfile
+          platforms: linux/amd64
+          tags: ghcr.io/${{ github.repository_owner }}/augur_database:test
+          cache-from: type=gha,scope=container-database
+          cache-to: type=gha,scope=container-database,mode=min
+          load: true
+
+      - name: Build keyman container
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: ./docker/keyman/Dockerfile
+          platforms: linux/amd64
+          tags: ghcr.io/${{ github.repository_owner }}/augur_keyman:test
+          cache-from: type=gha,scope=container-keyman
+          cache-to: type=gha,scope=container-keyman,mode=min
+          load: true
+
+      - name: Build rabbitmq container
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: ./docker/rabbitmq/Dockerfile
+          platforms: linux/amd64
+          tags: ghcr.io/${{ github.repository_owner }}/augur_rabbitmq:test
+          cache-from: type=gha,scope=container-rabbitmq
+          cache-to: type=gha,scope=container-rabbitmq,mode=min
+          load: true
+
+      - name: Build backend container
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: ./docker/backend/Dockerfile
+          platforms: linux/amd64
+          tags: ghcr.io/${{ github.repository_owner }}/augur_backend:test
+          cache-from: type=gha,scope=container-backend
+          cache-to: type=gha,scope=container-backend,mode=min
+          load: true
+
+      - name: Prepare compose file
+        run: |
+          yq eval -i '.services.augur.image = "ghcr.io/${{ github.repository_owner }}/augur_backend:test"' docker-compose.yml
+          yq eval -i '.services.augur.pull_policy = "never"' docker-compose.yml
+          yq eval -i '.services.augur.restart = "no"' docker-compose.yml
+
+          yq eval -i '.services.augur-db.image = "ghcr.io/${{ github.repository_owner }}/augur_database:test"' docker-compose.yml
+          yq eval -i '.services.augur-db.pull_policy = "never"' docker-compose.yml
+          yq eval -i '.services.augur-db.restart = "no"' docker-compose.yml
+
+          yq eval -i '.services.augur-keyman.image = "ghcr.io/${{ github.repository_owner }}/augur_keyman:test"' docker-compose.yml
+          yq eval -i '.services.augur-keyman.pull_policy = "never"' docker-compose.yml
+          yq eval -i '.services.augur-keyman.restart = "no"' docker-compose.yml
+
+          yq eval -i '.services.rabbitmq.image = "ghcr.io/${{ github.repository_owner }}/augur_rabbitmq:test"' docker-compose.yml
+          yq eval -i '.services.rabbitmq.pull_policy = "never"' docker-compose.yml
+          yq eval -i '.services.rabbitmq.restart = "no"' docker-compose.yml
+
+      - name: Setup Docker Compose
+        uses: docker/setup-compose-action@v1
+        with:
+          version: latest
+
+      - name: Set up list of log lines to match
+        run: |
+          cat <<EOF > /tmp/regex_matches.txt
+          Gunicorn webserver started
+          Starting core worker processes
+          Starting secondary worker processes
+          Starting facade worker processes
+          Retrieved \\d+ github api keys for use
+          Fetching new repos \\(complete\\)
+          Inserting \\d+ contributors
+          Inserting \\d+ issues
+          Inserting prs of length: \\d+
+          Querying committers count
Done generating scc data for repo + Sending due task + EOF + + - name: Start services & wait for output + # This starts the system and sends the output to "await_all.py" which + # scans for the regex matches from above. Once all matches are seen at + # least once, the `compose down` will run to shut down the system. If + # this all doesn't happen before the timeout, the job will fail. + run: | + docker compose -f docker-compose.yml up --no-build 2>&1 \ + | (./scripts/ci/await_all.py /tmp/regex_matches.txt \ + && docker compose -f docker-compose.yml down) + timeout-minutes: 3 + env: + AUGUR_GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }} + AUGUR_GITHUB_USERNAME: ${{ github.repository_owner }} + AUGUR_GITLAB_API_KEY: dummy + AUGUR_GITLAB_USERNAME: dummy + + - name: Dump logs + # Always run this step to get logs, even if the previous step fails + if: always() + # We use tail so that we can see the name of each file as it's printed + run: "docker run -t --rm -v augur_logs:/logs bash -c 'find /logs -type f | xargs tail -n +0'" + + + + push-image: + name: Push image + needs: test-e2e + # We don't push images on pull requests + if: github.event_name != 'pull_request' permissions: contents: read # to fetch code (actions/checkout) packages: write # to push docker image - - name: Build image + strategy: + matrix: + image: + - backend + - database + - rabbitmq runs-on: ubuntu-latest steps: - - name: Checkout main - uses: actions/checkout@v2 - - name: Run the build - run: | - set -ex - # use that here since the variable are not present before start, so can't be in env - export LOGIN=$GITHUB_REPOSITORY_OWNER - - echo $PASSWORD | docker login $REGISTRY -u $LOGIN --password-stdin - - for i in docker/* ; do - CONTAINER=$(basename $i) - echo "Building $CONTAINER" - export IMAGE=$LOGIN/augur_$CONTAINER - DOCKERFILE=${i}/Dockerfile - - docker build . 
-f $DOCKERFILE --tag $REGISTRY/$IMAGE:latest - if [[ $GITHUB_EVENT_NAME == 'release' ]]; then - TAG=$(basename $GITHUB_REF) - docker tag $REGISTRY/$IMAGE:latest $REGISTRY/$IMAGE:$TAG - docker push $REGISTRY/$IMAGE:latest - docker push $REGISTRY/$IMAGE:$TAG - elif [[ $GITHUB_EVENT_NAME == 'push' ]]; then - docker tag $REGISTRY/$IMAGE:latest $REGISTRY/$IMAGE:devel-latest - docker push $REGISTRY/$IMAGE:devel-latest - fi - done + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + id: setup-buildx + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + if: github.event_name != 'pull_request' + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Set container metadata + uses: docker/metadata-action@v5 + id: meta env: - REGISTRY: ghcr.io - PASSWORD: ${{ secrets.GITHUB_TOKEN }} + DOCKER_METADATA_ANNOTATIONS_LEVELS: index,manifest + with: + annotations: | + org.opencontainers.image.title=augur_${{ matrix.image}} + labels: | + org.opencontainers.image.title=augur_${{ matrix.image}} + images: ghcr.io/${{ github.repository_owner }}/augur_${{ matrix.image }} + # Pushes to the dev branch update the *:devel-latest tag + # Releases update the *:latest tag and the *: tag + # Main does not update any tags + tags: | + type=raw,value=devel-latest,enable=${{ github.ref == 'refs/heads/dev' }} + type=raw,value=latest,enable=${{ github.event_name == 'release' }} + type=raw,value=${{ github.event.release.tag_name }},enable=${{ github.event_name == 'release' }} + + - name: Build and push + id: push + uses: docker/build-push-action@v6 + with: + annotations: ${{ steps.meta.outputs.annotations }} + context: . + file: ./docker/${{ matrix.image }}/Dockerfile + labels: ${{ steps.meta.outputs.labels }} + platforms: linux/amd64 + # Only push if we've tagged the image in the metadata step + push: ${{ steps.meta.outputs.tags != '' }} + tags: ${{ steps.meta.outputs.tags }} + # Use the same cache as the build step + cache-from: type=gha,scope=container-${{ matrix.image }} + cache-to: type=gha,scope=container-${{ matrix.image }},mode=min diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index c23bfd7bb3..69b2fe7e22 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -28,4 +28,15 @@ jobs: uses: reviewdog/action-misspell@v1 with: github_token: ${{ secrets.GITHUB_TOKEN }} - locale: "US" \ No newline at end of file + locale: "US" + + uv-lock: + name: runner / uv-lock + runs-on: ubuntu-latest + steps: + - name: Check out repository + uses: actions/checkout@v4 + - name: Install uv + uses: astral-sh/setup-uv@v6 + - name: Ensure uv lockfile is up to date + run: uv lock --check diff --git a/.gitignore b/.gitignore index 46e492cd47..93be721ef1 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,8 @@ augur_export_env.sh !docker.config.json config.yml reports.yml +*.pid +*.sock node_modules/ .idea/ @@ -104,10 +106,6 @@ target/ profile_default/ ipython_config.py -# pyenv -.python-version -.python-version-hash - # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
# However, in case of collaboration, if having platform-specific dependencies or dependencies diff --git a/.pylintrc b/.pylintrc index aec2f59d4c..c319333602 100644 --- a/.pylintrc +++ b/.pylintrc @@ -3,16 +3,6 @@ # go here to check pylint codes if not explained #https://vald-phoenix.github.io/pylint-errors/ -#doc string checkers -#enable=C0112,C0114,C0115,C0116 - -# checks for black listed names being used -#enable=C0102 - -#refactoring checker -#enable=R - -disable=E0611,E1101,W1203,R0801,W0614,W0611,C0411,C0103,C0301,C0303,C0304,C0305,W0311,E0401,C0116 # Analyse import fallback blocks. This can be used to support both Python 2 and @@ -150,29 +140,9 @@ confidence=HIGH, INFERENCE_FAILURE, UNDEFINED -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once). You can also use "--disable=all" to -# disable everything first and then re-enable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use "--disable=all --enable=classes -# --disable=W". -disable=raw-checker-failed, - bad-inline-option, - locally-disabled, - file-ignored, - suppressed-message, - useless-suppression, - deprecated-pragma, - use-symbolic-message-instead - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -enable=c-extension-no-member +# Only enable specific messages +disable=all +enable=unused-import,redefined-outer-name,E1206,E1205,E0704,E0107,E4702,E1101,E0211,E0213,E0103,E1133,E1120,E3102,E0602,E1123,E0001,W0702,W1404,W0706,W0101,W0120,W0718,R1737,R1705,R1720,R1724,R1723,R0401,R1701,C1802,C0200,C0501,C0201,W1001,E1102,R0923 [LOGGING] diff --git a/.python-version b/.python-version new file mode 100644 index 0000000000..2c0733315e --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.11 diff --git a/.readthedocs.yml b/.readthedocs.yml index 3b0e387327..120fd08332 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -5,10 +5,15 @@ # Required version: 2 -build: +build: os: ubuntu-22.04 # <- add this line tools: python: "3.10" + jobs: + post_create_environment: + # Use uv to create a requirements file that RTD can install + - pip install uv + - uv export --format requirements.txt --only-group docs -o requirements.txt # Build documentation in the docs/ directory with Sphinx sphinx: @@ -24,12 +29,8 @@ formats: all # Optionally set the version of Python and requirements required to build your docs python: install: - - method: pip - path: . - extra_requirements: - - dev - - method: setuptools - path: . + # Install the requirements file created during the post_create_environment job + - requirements: requirements.txt # build: # os: ubuntu-22.04 diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000000..e26f3d8a86 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,14 @@ +# This CITATION.cff reference content was generated from Zotero. +cff-version: 1.2.0 +message: "If you use this software, please cite it as below." 
+authors: + - family-names: Goggins + given-names: Sean + - family-names: Lumbard + given-names: Kevin + - family-names: Germonprez + given-names: Matt +title: "Open Source Community Health: Analytical Metrics and Their Corresponding Narratives" +doi: 10.1109/SoHeal52568.2021.00010 +date-released: 2021 +url: https://www.seangoggins.net/wp-content/plugins/zotpress/lib/request/request.dl.php?api_user_id=655145&dlkey=HNG22ZSU&content_type=application/pdf diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8b1e07b609..6fed03731b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,17 +1,32 @@ # How to Contribute -We love to pull requests from everyone! We follow the standard Git workflow of `fork -> change -> pull request -> merge -> update fork -> change ... (repeat forever)`. If you are new to open source, we recommend GitHub's excellent guide on "[How to Contribute to Open Source](https://opensource.guide/how-to-contribute/)". In addition, please feel free to reach out to any of the maintainers or other community members if you are struggling; we are here to help you learn! +We love to pull requests from everyone! We follow the standard Git workflow of `fork -> change -> pull request -> merge -> update fork -> change ... (repeat forever)`. -Before getting started, please make sure you've read the [README](README.md) to get a primer on our project. Augur's documentation can be found at: https://oss-augur.readthedocs.io/en/main/ +If you are new to open source, we recommend GitHub's excellent guide on "[How to Contribute to Open Source](https://opensource.guide/how-to-contribute/)". In addition, please feel free to reach out to any of the maintainers or other community members if you are struggling as we are here to help you learn! + +Before getting started, please make sure you've read the [README](README.md) to get a primer on our project. Augur's documentation can be found [here](https://oss-augur.readthedocs.io/en/main/). ## Opening an issue If you're experiencing an issue with Augur or have a question you'd like help answering, please feel free to open an [issue](https://github.com/chaoss/augur/issues). To help us prevent duplicates, we kindly ask that you briefly search for your problem or question in our [issues](https://github.com/chaoss/augur/issues) before opening a new one. -Please note that if you open a bug report and your issue does not follow our template, we cannot help you until you have provided us all the relevant information in that format. Respectfully, we do not have the time to try and recreate an error given with minimal or no context, so by providing this information you are helping us help you! You will see this template when you open an issue; click on "Bug Report" and it will be populated with descriptions of what to put in each section. Replace the descriptions with your comments to the best of your ability, and please include screenshots and error logs if applicable. +Please note that if you open a bug report and your issue **does not** follow our template, we cannot help you until you have provided us all the relevant information in that format. +Respectfully, we do not have the time to try and recreate an error given with minimal or no context, so by providing this information you are helping us help you! + +### How to submit a bug report +To see the template referred to in the above section, click on **New Issue**, then click on the **Get Started** button on the **Bug Report** option. 
+A dialogue box populated with descriptions of what to put in each section, will pop up on a new page. +Kindly replace the descriptions with your comments to the best of your ability, and please include screenshots and error logs if applicable. + +file1 + +file2 + +file3 + ## Contributing to the source code -1. Fork this repo, and then clone it: +1. Fork and clone this repo: ```bash $ git clone github.com:your-username/augur.git $ cd augur/ @@ -35,45 +50,41 @@ $ git push -u origin my-new-branch ``` 6. Then, [submit a pull request](https://github.com/chaoss/augur/compare). -At this point, you're waiting on us. We like to at least comment on pull requests -within three business days (and, typically, one business day). Once one of our maintainers has had a chance to review your PR, we will either mark it as "needs review" and provide specific feedback on your changes, or we will go ahead and complete the pull request. +At this point, you're waiting on us. We like to at least comment on pull requests within three business days (and, typically, one business day). +Once one of our maintainers has had a chance to review your PR, we will either mark it as ```needs review``` and provide specific feedback on your changes, or we will go ahead and complete the pull request. ## Signing-off on Commits -To contribute to this project, you must agree to the [Developer Certificate of Origin](https://developercertificate.org/) by the [CHAOSS charter](https://chaoss.community/about/charter/#user-content-8-intellectual-property-policy) for each commit you make. The DCO is a simple statement that you, as a contributor, have the legal right to make the contribution. -To signify that you agree to the DCO for contributions, you simply add a line to each of your -git commit messages: +To contribute to this project, you must agree to the [Developer Certificate of Origin](https://developercertificate.org/) (DCO) by the [CHAOSS charter](https://chaoss.community/about/charter/#user-content-8-intellectual-property-policy) for each commit you make. The DCO is a simple statement that you, as a contributor, have the legal right to make the contribution. +To signify that you agree to the DCO for contributions, you simply add a line to each of your git commit messages. For example: +``` +Signed-off-by: Jane Smith +``` - ``` - Signed-off-by: Jane Smith - ``` -This can be easily done by using the `-s` flag when using `git commit`. For example: +This can be easily done by using the `-s` flag when running the `git commit` command, ``` $ git commit -s -m “my commit message w/signoff” ``` + To ensure all your commits are signed, you may choose to [configure git](https://gist.github.com/xavierfoucrier/c156027fcc6ae23bcee1204199f177da) properly by editing your global ```.gitconfig``` **Any pull requests containing commits that are not signed off will not be eligible for merge until the commits have been signed off.** ## Keeping in sync with the Augur Repository -Remeber to sync your fork with the main branch regularly. 
-To do this: +Remember to sync your fork with the ```main``` branch regularly, by taking the following steps: -Go to github and copy the url of the main Augur repo - ``` - https://github.com/chaoss/augur.git - ``` - make sure to be in the rootfolder of the project and the branch should be master branch and type - ``` - git remote add upstream https://github.com/chaoss/augur.git - ``` - Now you have your upstream setup in your local machine,whenever you need to make a new branch for making changes make sure your main branch is in sync with the main repository, to do this,make sure to be in the main branch and type +- Setup your upstream branch to point to the URL of the main Augur repo ```https://github.com/chaoss/augur.git```. - ``` - git pull upstream master - git push origin master - ``` +- Next, in the root folder of the project, on the ```main``` branch, run: +``` +git remote add upstream https://github.com/chaoss/augur.git +``` +Whenever you need to make changes, make sure your ```main``` branch is in sync with the main repository, by checking out to the ```main``` branch and running: +``` +git pull upstream main +git push origin master +``` ## Community Resources @@ -81,7 +92,7 @@ Go to github and copy the url of the main Augur repo ### Augur - [Stable documentation (`main` branch)](https://oss-augur.readthedocs.io/en/main/) - [Nightly/developer build documentation (`dev` branch)](https://oss-augur.readthedocs.io/en/dev/) (warning: this is should be considered an unstable branch and should not be used for production) -- [Live Augur demo](http://zephyr.osshealth.io/) +- [Live Augur demo](https://ai.chaoss.io) ### CHAOSS - [Website](https://chaoss.community/) diff --git a/Makefile b/Makefile index 22364ac160..4fe926edc4 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ default: @ echo "Installation Commands:" @ echo " install Installs Augur's full stack for production" + @ echo " wizard Install Augur and launch the graphical setup wizard" @ echo " clean Removes potentially troublesome compiled files" @ echo " rebuild Removes build/compiled files & binaries and reinstalls the project" @ echo @@ -31,8 +32,11 @@ default: .PHONY: install .PHONY: install-spdx install-spdx-sudo install-augur-sbom .PHONY: clean rebuild -install: - @ ./scripts/install/install.sh dev +install: uv + @ uv run ./scripts/install/install.sh dev + +wizard: + @ ./scripts/install/install.sh graphical install-spdx: @ ./scripts/install/install-spdx.sh @@ -46,8 +50,8 @@ install-augur-sbom: clean: @ scripts/control/clean.sh -rebuild: - @ scripts/control/rebuild.sh dev +rebuild: uv + @ uv run scripts/control/rebuild.sh dev # # Development @@ -120,12 +124,20 @@ test-api: # @ bash -c 'tox -e ALL' +# +# UV installation +# +.PHONY: uv +uv: + @ command -v uv >/dev/null 2>&1 || { echo "Installing uv..."; pip install --user uv; } + # # Documentation # .PHONY: docs docs-view -docs: - @ bash -c 'cd docs/ && rm -rf build/ && make html;' +docs: uv + -rm -rf docs/build + uv run --only-group docs make -C docs html docs-view: docs @ bash -c 'open docs/build/html/index.html' @@ -180,4 +192,4 @@ docker-run-database: docker-run-rabbitmq: @ - docker stop augur_rabbitmq @ - docker rm augur_rabbitmq - docker run -p 5434:5432 --name augur_rabbitmq augurlabs/augur:rabbitmq \ No newline at end of file + docker run -p 5434:5432 --name augur_rabbitmq augurlabs/augur:rabbitmq diff --git a/README.md b/README.md index f760fa3b2c..ff7480b702 100644 --- a/README.md +++ b/README.md @@ -1,32 +1,39 @@ -# Augur NEW Release v0.62.6 +# Augur NEW Release 
v0.86.1 -Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else! -The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io +Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data - less data carpentry for everyone else! +The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot), a public instance of 8Knot is available [here](https://metrix.chaoss.io) - this is tied to a public instance of [Augur](https://ai.chaoss.io). -[![first-timers-only](https://img.shields.io/badge/first--timers--only-friendly-blue.svg?style=flat-square)](https://www.firsttimersonly.com/) We follow the [First Timers Only](https://www.firsttimersonly.com/) philosophy of tagging issues for first timers only, and walking one newcomer through the resolution process weekly. [You can find these issues tagged with "first timers only" on our issues list.](https://github.com/chaoss/augur/labels/first-timers-only). +[![first-timers-only](https://img.shields.io/badge/first--timers--only-friendly-blue.svg?style=flat-square)](https://www.firsttimersonly.com/) +We follow the [First Timers Only](https://www.firsttimersonly.com/) philosophy of tagging issues for first timers only, and walking one newcomer through the resolution process weekly. You can find these issues tagged with [first timers only](https://github.com/chaoss/augur/labels/first-timers-only) on our issues list. [![standard-readme compliant](https://img.shields.io/badge/standard--readme-OK-green.svg?style=flat-square)](https://github.com/RichardLitt/standard-readme) [![Build Docker images](https://github.com/chaoss/augur/actions/workflows/build_docker.yml/badge.svg)](https://github.com/chaoss/augur/actions/workflows/build_docker.yml) [![Hits-of-Code](https://hitsofcode.com/github/chaoss/augur?branch=main)](https://hitsofcode.com/github/chaoss/augur/view?branch=main) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/2788/badge)](https://bestpractices.coreinfrastructure.org/projects/2788) ## NEW RELEASE ALERT! -### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md) +**If you want to jump right in, the updated docker, docker-compose and bare metal installation instructions are available [here](docs/new-install.md)**. + +Augur is now releasing a dramatically improved new version to the ```main``` branch. It is also available [here](https://github.com/chaoss/augur/releases/tag/v0.86.1). + -Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.62.6 - The `main` branch is a stable version of our new architecture, which features: - Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks. 
-  - A new job management architecture that uses Celery and Redis to manage queues, and enables users to run a Flower job monitoring dashboard
-  - Materialized views to increase the snappiness of API’s and Frontends on large scale data
-  - Changes to primary keys, which now employ a UUID strategy that ensures unique keys across all Augur instances
-  - Support for https://github.com/oss-aspen/8knot dashboards (view a sample here: https://eightknot.osci.io/). (beautification coming soon!)
-  - Data collection completeness assurance enabled by a structured, relational data set that is easily compared with platform API Endpoints
-- The next release of the new version will include a hosted version of Augur where anyone can create an account and add repos “they care about”. If the hosted instance already has a requested organization or repository it will be added to a user’s view. If its a new repository or organization, the user will be notified that collection will take (time required for the scale of repositories added).
+  - A new job management architecture that uses Celery and Redis to manage queues, and enables users to run a Flower job monitoring dashboard.
+  - Materialized views to increase the snappiness of API’s and Frontends on large scale data.
+  - Changes to primary keys, which now employ a UUID strategy that ensures unique keys across all Augur instances.
+  - Support for [8knot](https://github.com/oss-aspen/8knot) dashboards (view a sample [here](https://eightknot.osci.io/)).
+  *beautification coming soon!*
+  - Data collection completeness assurance enabled by a structured, relational data set that is easily compared with platform API Endpoints.
+- The next release of the new version will include a hosted version of Augur where anyone can create an account and add repos *they care about*.
+If the hosted instance already has a requested organization or repository it will be added to a user’s view. If it's a new repository or organization, the user will be notified that collection will take (time required for the scale of repositories added).
 
 ## What is Augur?
 
 Augur is a software suite for collecting and measuring structured data about [free](https://www.fsf.org/about/) and [open-source](https://opensource.org/docs/osd) software (FOSS) communities. We gather trace data for a group of repositories, normalize it into our data model, and provide a variety of metrics about said data. The structure of our data model enables us to synthesize data across various platforms to provide meaningful context for meaningful questions about the way these communities evolve.
 
-Augur’s main focus is to measure the overall health and sustainability of open source projects, as these types of projects are system critical for nearly every software organization or company. We do this by gathering data about project repositories and normalizing that into our data model to provide useful metrics about your project’s health. For example, one of our metrics is Burstiness. Burstiness – how are short timeframes of intense activity, followed by a corresponding return to a typical pattern of activity, observed in a project?
+Augur’s main focus is to measure the overall health and sustainability of open source projects, as these types of projects are system critical for nearly every software organization or company. We do this by gathering data about project repositories and normalizing that into our data model to provide useful metrics about your project’s health.
+
+For example, one of our metrics is *burstiness*.
Burstiness – how are short timeframes of intense activity, followed by a corresponding return to a typical pattern of activity, observed in a project? This can paint a picture of a project’s focus and gain insight into the potential stability of a project and how its typical cycle of updates occurs. We are a [CHAOSS](https://chaoss.community) project, and many of our @@ -36,39 +43,53 @@ For more information on [how to get involved on the CHAOSS website](https://chao ## Collecting Data -Augur supports Python3.6 through Python3.9 on all platforms. Python3.10 and above do not yet work because of machine learning worker dependencies. On OSX, you can create a Python 3.9 environment this way: `python3.9 -m venv path/to/venv`. +Augur supports ```Python3.7``` through ```Python3.11``` on all platforms. ```Python3.12``` and above do not yet work because of machine learning worker dependencies. On OSX, you can create a ```Python3.11``` environment, by running: +``` +$ python3.11 -m venv path/to/venv +``` Augur's main focus is to measure the overall health and sustainability of open source projects. Augur collects more data about open source software projects than any other available software. Augur's main focus is to measure the overall health and sustainability of open source projects. -One of Augur's core tenets is a desire to openly gather data that people can trust, and then provide useful and well-defined metrics that help give important context to the larger stories being told by that data. We do this in a variety of ways, one of which is doing all our own data collection in house. We currently collect data from a few main sources: + +One of Augur's core tenets is a desire to openly gather data that people can trust, and then provide useful and well-defined metrics that help give important context to the larger stories being told by that data. + +We do this in a variety of ways, one of which is doing all our own data collection in house. We currently collect data from a few main sources: 1. Raw Git commit logs (commits, contributors) 2. GitHub's API (issues, pull requests, contributors, releases, repository metadata) 3. The Linux Foundation's [Core Infrastructure Initiative](https://www.coreinfrastructure.org/) API (repository metadata) 4. [Succinct Code Counter](https://github.com/boyter/scc), a blazingly fast Sloc, Cloc, and Code tool that also performs COCOMO calculations -This data is collected by dedicated data collection workers controlled by Augur, each of which is responsible for querying some subset of these data sources. We are also hard at work building workers for new data sources. If you have an idea for a new one, [please tell us](https://github.com/chaoss/augur/issues/new?template=feature_request.md) - we'd love your input! +This data is collected by dedicated data collection workers controlled by Augur, each of which is responsible for querying some subset of these data sources. +We are also hard at work building workers for new data sources. If you have an idea for a new one, [please tell us](https://github.com/chaoss/augur/issues/new?template=feature_request.md) - we'd love your input! ## Getting Started If you're interested in collecting data with our tool, the Augur team has worked hard to develop a detailed guide to get started with our project which can be found [in our documentation](https://oss-augur.readthedocs.io/en/main/getting-started/toc.html). 
-If you're looking to contribute to Augur's code, you can find installation instructions, development guides, architecture references (coming soon), best practices and more in our [developer documentation](https://oss-augur.readthedocs.io/en/main/development-guide/toc.html). Please know that while it's still rather sparse right now, -but we are actively adding to it all the time. If you get stuck, please feel free to [ask for help](https://github.com/chaoss/augur/issues/new)! +If you're looking to contribute to Augur's code, you can find installation instructions, development guides, architecture references (coming soon), best practices and more in our [developer documentation](https://oss-augur.readthedocs.io/en/main/development-guide/toc.html). + +Please know that while it's still rather sparse right now, +but we are actively adding to it all the time. + +If you get stuck, please feel free to [ask for help](https://github.com/chaoss/augur/issues/new)! ## Contributing -To contribute to Augur, please follow the guidelines found in our [CONTRIBUTING.md](CONTRIBUTING.md) and our [Code of Conduct](CODE_OF_CONDUCT.md). Augur is a welcoming community that is open to all, regardless if you're working on your 1000th contribution to open source or your 1st. We strongly believe that much of what makes open source so great is the incredible communities it brings together, so we invite you to join us! +To contribute to Augur, please follow the guidelines found in our [CONTRIBUTING.md](CONTRIBUTING.md) and our [Code of Conduct](CODE_OF_CONDUCT.md). Augur is a welcoming community that is open to all, regardless if you're working on your 1000th contribution to open source or your 1st. +We strongly believe that much of what makes open source so great is the incredible communities it brings together, so we invite you to join us! ## License, Copyright, and Funding -Copyright © 2023 University of Nebraska at Omaha, University of Missouri, Brian Warner, and the CHAOSS Project. +Copyright © 2025 University of Nebraska at Omaha, University of Missouri, Brian Warner, and the CHAOSS Project. Augur is free software: you can redistribute it and/or modify it under the terms of the MIT License as published by the Open Source Initiative. See the [LICENSE](LICENSE) file for more details. -This work has been funded through the Alfred P. Sloan Foundation, Mozilla, The Reynolds Journalism Institute, contributions from VMWare, Red Hat Software, Grace Hopper's Open Source Day, GitHub, Microsoft, Twitter, Adobe, the Gluster Project, Open Source Summit (NA/Europe), and the Linux Foundation Compliance Summit. Significant design contributors include Kate Stewart, Dawn Foster, Duane O'Brien, Remy Decausemaker, others omitted due to the memory limitations of project maintainers, and 15 Google Summer of Code Students. +This work has been funded through the Alfred P. Sloan Foundation, Mozilla, The Reynolds Journalism Institute, contributions from VMWare, Red Hat Software, Grace Hopper's Open Source Day, GitHub, Microsoft, Twitter, Adobe, the Gluster Project, Open Source Summit (NA/Europe), and the Linux Foundation Compliance Summit. + +Significant design contributors include Kate Stewart, Dawn Foster, Duane O'Brien, Remy Decausemaker, others omitted due to the memory limitations of project maintainers, and 15 Google Summer of Code Students. Current maintainers -------------------- @@ -78,8 +99,6 @@ Current maintainers - `John McGinnis `_ - `Sean P. 
Goggins `_ - - Former maintainers -------------------- - `Carter Landis `_ @@ -133,4 +152,3 @@ GSoC 2019 participants GSoC 2018 participants ----------------------- - `Keanu Nichols `_ - diff --git a/Vagrantfile b/Vagrantfile deleted file mode 100644 index 95de27ce02..0000000000 --- a/Vagrantfile +++ /dev/null @@ -1,108 +0,0 @@ -$script = <<-'SCRIPT' -set -euxo pipefail - -sudo apt-get -y update -sudo apt-get -y install --no-install-recommends \ - build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libsqlite3-dev libreadline-dev libffi-dev curl libbz2-dev \ - git gcc gfortran \ - python3 python3-pip python3.8-venv \ - postgresql postgresql-contrib \ - libomp-dev \ - golang libgomp1 -sudo pg_ctlcluster 12 main start - -go get -u github.com/boyter/scc/ - -# # install Go -# installGo() ( -# cd "$(mktemp -d)" -# wget https://golang.org/dl/go1.16.5.linux-amd64.tar.gz -# rm -rf /usr/local/go && tar -C /usr/local -xzf go1.16.5.linux-amd64.tar.gz -# ) -# sudo installGo -# export PATH=$PATH:/usr/local/go/bin - - -########################################################################################## -# see: https://oss-augur.readthedocs.io/en/master/getting-started/database.html -cat < /tmp/init.psql -CREATE DATABASE augur; -CREATE USER augur WITH ENCRYPTED PASSWORD 'password'; -GRANT ALL PRIVILEGES ON DATABASE augur TO augur; -EOF -sudo -u postgres psql -U postgres -f /tmp/init.psql - - -########################################################################################## -# see: https://oss-augur.readthedocs.io/en/master/getting-started/installation.html - -mkdir -p "$HOME/augur/" "$HOME/augur/logs/" "$HOME/augur/repos/" -cat < "$HOME/augur/config.json" -{ - "Database": { - "host": "localhost", - "password": "password" - }, - "Server": { - "host": "0.0.0.0" - }, - "Logging": { - "logs_directory": "$HOME/augur/logs/", - "log_level": "INFO", - "verbose": 0, - "quiet": 0, - "debug": 1 - }, - "Workers": { - "facade_worker": { - "repo_directory": "$HOME/augur/repos/", - "switch": 1 - }, - "github_worker": { - "switch": 1 - }, - "insight_worker": { - "switch": 1 - }, - "linux_badge_worker": { - "switch": 1 - }, - "pull_request_worker": { - "switch": 1 - }, - "repo_info_worker": { - "switch": 1 - }, - "release_worker": { - "switch": 1 - } - } -} -EOF - - -python3 -m venv $HOME/.virtualenvs/augur_env -source $HOME/.virtualenvs/augur_env/bin/activate -pip install wheel - -cd /vagrant -python setup.py bdist_wheel -make clean -make install-dev - -augur config init --rc-config-file "$HOME/config.json" -augur db create-schema -augur backend start" - -SCRIPT - -Vagrant.configure("2") do |config| - config.vm.box = "ubuntu/focal64" - - config.vm.provider "virtualbox" do |v| - v.memory = 20480 - v.cpus = 4 - end - - config.vm.provision "shell", privileged: false, inline: $script -end diff --git a/add.md b/add.md deleted file mode 100644 index eaf2a3fac6..0000000000 --- a/add.md +++ /dev/null @@ -1 +0,0 @@ -dfadffd diff --git a/augur/api/metrics/deps.py b/augur/api/metrics/deps.py index 4f708cc40e..486fd1ec49 100644 --- a/augur/api/metrics/deps.py +++ b/augur/api/metrics/deps.py @@ -48,7 +48,7 @@ def deps(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=No """) with current_app.engine.connect() as conn: - results = pd.read_sql(depsSQL, conn) + results = pd.read_sql(depsSQL, conn, params={'repo_id': repo_id}) else: @@ -73,8 +73,166 @@ def deps(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=No """) with current_app.engine.connect() as conn: 
- results = pd.read_sql(depsSQL, conn) + results = pd.read_sql(depsSQL, conn, params={'repo_group_id': repo_group_id}) return results +@register_metric() +def libyear(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=None): + """ + Returns a list of all the dependencies in a project/repo/repo_group. + + :param repo_id: The repository's id + :param repo_group_id: The repository's group id + :param period: To set the periodicity to 'day', 'week', 'month' or 'year', defaults to 'day' + :param begin_date: Specifies the begin date, defaults to '1970-1-1 00:00:00' + :param end_date: Specifies the end date, defaults to datetime.now() + :return: DataFrame of persons/period + """ + + if not begin_date: + begin_date = '1970-1-1 00:00:01' + if not end_date: + end_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + + if repo_id: + + libyearSQL = s.sql.text(""" + SELECT + rg_name, + repo_group_id, + repo_name, + d.repo_id, + repo_git, + forked_from, + repo_archived, + c.name, + c.libyear, + MAX ( C.data_collection_date ) AS most_recent_collection + FROM + ( + SELECT A.rg_name AS rg_name, + A.repo_group_id AS repo_group_id, + b.repo_name AS repo_name, + b.repo_id AS repo_id, + b.repo_git AS repo_git, + b.forked_from AS forked_from, + b.repo_archived AS repo_archived + FROM + repo_groups A, + repo b + WHERE + A.repo_group_id = b.repo_group_id + ORDER BY + rg_name, + repo_name + ) d, + ( + SELECT DISTINCT + f.repo_id, + f.NAME, + f.libyear, + f.data_collection_date + FROM + ( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM augur_data.repo_deps_libyear WHERE repo_id = :repo_id GROUP BY repo_id, NAME ORDER BY NAME ) e, + augur_data.repo_deps_libyear f + WHERE + e.data_collection_date = f.data_collection_date and + e.repo_id = f.repo_id + ORDER BY + NAME + ) C + WHERE + d.repo_id = C.repo_id + AND C.repo_id = :repo_id + GROUP BY + rg_name, + repo_git, + repo_group_id, + repo_name, + d.repo_id, + forked_from, + repo_archived, + c.name, + c.libyear + ORDER BY + repo_id; + """) + + with current_app.engine.connect() as conn: + results = pd.read_sql(libyearSQL, conn, params={'repo_id': repo_id}) + + else: + + libyearSQL = s.sql.text(""" + Select w.* from + ( + SELECT + rg_name, + repo_group_id, + repo_name, + d.repo_id, + repo_git, + forked_from, + repo_archived, + c.name, + c.libyear, + MAX ( C.data_collection_date ) AS most_recent_collection + FROM + ( + SELECT A.rg_name AS rg_name, + A.repo_group_id AS repo_group_id, + b.repo_name AS repo_name, + b.repo_id AS repo_id, + b.repo_git AS repo_git, + b.forked_from AS forked_from, + b.repo_archived AS repo_archived + FROM + repo_groups A, + repo b + WHERE + A.repo_group_id = b.repo_group_id + ORDER BY + rg_name, + repo_name + ) d, + ( + SELECT DISTINCT + f.repo_id, + f.NAME, + f.libyear, + f.data_collection_date + FROM + ( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM augur_data.repo_deps_libyear GROUP BY repo_id, NAME ORDER BY NAME ) e, + augur_data.repo_deps_libyear f + WHERE + e.data_collection_date = f.data_collection_date and + e.repo_id = f.repo_id + ORDER BY + NAME + ) C + WHERE + d.repo_id = C.repo_id + GROUP BY + rg_name, + repo_git, + repo_group_id, + repo_name, + d.repo_id, + forked_from, + repo_archived, + c.name, + c.libyear + ORDER BY + repo_id) w, + repo_groups y, + repo z + where w.repo_id=z.repo_id and + y.repo_group_id=z.repo_group_id + and z.repo_group_id = :repo_group_id + """) + + with current_app.engine.connect() as conn: + results = 
pd.read_sql(libyearSQL, conn, params={'repo_group_id': repo_group_id}) + return results diff --git a/augur/api/metrics/pull_request.py b/augur/api/metrics/pull_request.py index 75116b5e54..447c9557ae 100644 --- a/augur/api/metrics/pull_request.py +++ b/augur/api/metrics/pull_request.py @@ -3,14 +3,13 @@ Metrics that provide data about pull requests & their associated activity """ -import datetime +from datetime import datetime import sqlalchemy as s import pandas as pd from flask import current_app from augur.api.util import register_metric - @register_metric() def pull_requests_new(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=None): """ @@ -31,14 +30,15 @@ def pull_requests_new(repo_group_id, repo_id=None, period='day', begin_date=None if repo_id: new_pull_requests_query = s.sql.text(""" SELECT DATE_TRUNC(:period, pr_created_at) AS created_date, - COUNT(pr_id) AS new_pull_requests + COUNT(*) AS new_pull_requests FROM pull_requests WHERE repo_id = :repo_id AND pr_created_at BETWEEN :begin_date AND :end_date GROUP BY created_date """) - results = pd.read_sql(new_pull_requests_query, current_app.engine, params={'repo_id': repo_id, 'period': period, + with current_app.engine.connect() as conn: + results = pd.read_sql(new_pull_requests_query, conn, params={'repo_id': repo_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) else: @@ -51,8 +51,9 @@ def pull_requests_new(repo_group_id, repo_id=None, period='day', begin_date=None GROUP BY created_date """) - results = pd.read_sql(new_pull_requests_query, current_app.engine, - params={'repo_group_id': repo_group_id, 'period': period, + with current_app.engine.connect() as conn: + results = pd.read_sql(new_pull_requests_query, conn, + params={'repo_group_id': repo_group_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) @@ -73,7 +74,7 @@ def pull_requests_merge_contributor_new(repo_group_id, repo_id=None, period='day if not begin_date: begin_date = '1970-1-1 00:00:01' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + end_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S') if repo_id: commitNewContributor = s.sql.text(""" @@ -129,7 +130,7 @@ def pull_requests_closed_no_merge(repo_group_id, repo_id=None, period='day', beg if not begin_date: begin_date = '1970-1-1 00:00:01' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + end_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S') if repo_id: closedNoMerge = s.sql.text(""" @@ -181,7 +182,7 @@ def reviews(repo_group_id, repo_id=None, period='day', begin_date=None, end_date if not begin_date: begin_date = '1970-1-1' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d') + end_date = datetime.now().strftime('%Y-%m-%d') if not repo_id: reviews_SQL = s.sql.text(""" @@ -242,7 +243,7 @@ def reviews_accepted(repo_group_id, repo_id=None, period='day', begin_date=None, if not begin_date: begin_date = '1970-1-1' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d') + end_date = datetime.now().strftime('%Y-%m-%d') if not repo_id: reviews_accepted_SQL = s.sql.text(""" @@ -303,7 +304,7 @@ def reviews_declined(repo_group_id, repo_id=None, period='day', begin_date=None, if not begin_date: begin_date = '1970-1-1' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d') + end_date = datetime.now().strftime('%Y-%m-%d') if not repo_id: reviews_declined_SQL = s.sql.text(""" @@ -363,7 +364,7 @@ def review_duration(repo_group_id, 
repo_id=None, begin_date=None, end_date=None) if not begin_date: begin_date = '1970-1-1' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d') + end_date = datetime.now().strftime('%Y-%m-%d') if not repo_id: review_duration_SQL = s.sql.text(""" @@ -428,7 +429,7 @@ def pull_request_acceptance_rate(repo_group_id, repo_id=None, begin_date=None, e if not begin_date: begin_date = '1970-1-1 00:00:01' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + end_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S') if not repo_id: prAccRateSQL = s.sql.text(""" @@ -517,7 +518,7 @@ def pull_request_average_time_to_close(repo_group_id, repo_id=None, group_by='mo if not begin_date: begin_date = '1970-1-1' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d') + end_date = datetime.now().strftime('%Y-%m-%d') unit_options = ['year', 'month', 'week', 'day'] @@ -606,6 +607,11 @@ def pull_request_average_time_to_close(repo_group_id, repo_id=None, group_by='mo pr_all = pd.read_sql(pr_all_SQL, conn, params={'repo_id': repo_id, 'repo_group_id':repo_group_id, 'begin_date': begin_date, 'end_date': end_date}) + + + if pr_all.empty: + return [] + if not repo_id: pr_avg_time_to_close = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).mean().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + ['average_{}_to_close'.format(time_unit)]] else: @@ -633,7 +639,7 @@ def pull_request_merged_status_counts(repo_group_id, repo_id=None, begin_date='1 if not begin_date: begin_date = '1970-1-1' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d') + end_date = datetime.now().strftime('%Y-%m-%d') unit_options = ['year', 'month', 'week', 'day'] @@ -719,6 +725,10 @@ def pull_request_merged_status_counts(repo_group_id, repo_id=None, begin_date='1 pr_all = pd.read_sql(pr_all_SQL, conn, params={'repo_id': repo_id, 'repo_group_id':repo_group_id, 'begin_date': begin_date, 'end_date': end_date}) + + if pr_all.empty: + return [] + if not repo_id: pr_avg_time_between_responses = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).mean().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + ['average_{}_between_responses'.format(time_unit)]] else: @@ -741,7 +751,7 @@ def pull_request_average_commit_counts(repo_group_id, repo_id=None, group_by='mo if not begin_date: begin_date = '1970-1-1' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d') + end_date = datetime.now().strftime('%Y-%m-%d') unit_options = ['year', 'month', 'week', 'day'] @@ -830,6 +840,10 @@ def pull_request_average_commit_counts(repo_group_id, repo_id=None, group_by='mo pr_all = pd.read_sql(pr_all_SQL, conn, params={'repo_id': repo_id, 'repo_group_id':repo_group_id, 'begin_date': begin_date, 'end_date': end_date}) + + if pr_all.empty: + return [] + if not repo_id: pr_avg_commit_counts = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).mean().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + ['average_commits_per_pull_request']] else: @@ -852,7 +866,7 @@ def pull_request_average_event_counts(repo_group_id, repo_id=None, group_by='mon if not begin_date: begin_date = '1970-1-1' if not end_date: - end_date = 
datetime.datetime.now().strftime('%Y-%m-%d') + end_date = datetime.now().strftime('%Y-%m-%d') unit_options = ['year', 'month', 'week', 'day'] @@ -996,6 +1010,10 @@ def pull_request_average_event_counts(repo_group_id, repo_id=None, group_by='mon for name in count_names.copy(): average_count_names.append('average_' + name) + + if pr_all.empty: + return [] + if not repo_id: pr_avg_event_counts = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).mean().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + average_count_names] else: @@ -1019,7 +1037,7 @@ def pull_request_average_time_to_responses_and_close(repo_group_id, repo_id=None if not begin_date: begin_date = '1970-1-1' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d') + end_date = datetime.now().strftime('%Y-%m-%d') unit_options = ['year', 'month', 'week', 'day'] @@ -1115,6 +1133,9 @@ def pull_request_average_time_to_responses_and_close(repo_group_id, repo_id=None params={'repo_id': repo_id, 'repo_group_id':repo_group_id, 'begin_date': begin_date, 'end_date': end_date}) + if pr_all.empty: + return [] + if not repo_id: avg_pr_time_to_responses_and_close = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).mean().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + ['average_{}_to_first_response'.format(time_unit), 'average_{}_to_last_response'.format(time_unit), 'average_{}_to_close'.format(time_unit)]] else: @@ -1135,7 +1156,7 @@ def pull_request_merged_status_counts(repo_group_id, repo_id=None, begin_date='1 """ if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + end_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S') unit_options = ['year', 'month', 'week', 'day'] time_group_bys = [] diff --git a/augur/api/metrics/repo_meta.py b/augur/api/metrics/repo_meta.py index 256469abac..ffc8fc84ef 100644 --- a/augur/api/metrics/repo_meta.py +++ b/augur/api/metrics/repo_meta.py @@ -285,6 +285,10 @@ def nadia_project_labeling_badge(repo_group_id, repo_id=None): with current_app.engine.connect() as conn: raw_df = pd.read_sql(stars_count_SQL, conn) + + if raw_df.empty: + return {"status": "Not enough data"} + stargazers_count = int(raw_df.at[0,'stars']) repo_name = str(raw_df.at[0,'repo_name']) diff --git a/augur/api/metrics/toss.py b/augur/api/metrics/toss.py index 99c7683eb8..40a4a12b00 100644 --- a/augur/api/metrics/toss.py +++ b/augur/api/metrics/toss.py @@ -28,7 +28,7 @@ def toss_pull_request_acceptance_rate(repo_id, begin_date=None, end_date=None, g ( SELECT COUNT ( pull_request_events.pull_request_id ) AS num_approved, - repo_id + pull_requests.repo_id FROM pull_requests JOIN pull_request_events ON pull_request_events.pull_request_id = pull_requests.pull_request_id @@ -39,12 +39,12 @@ def toss_pull_request_acceptance_rate(repo_id, begin_date=None, end_date=None, g AND pull_request_events.created_at BETWEEN :begin_date AND :end_date GROUP BY - repo_id + pull_requests.repo_id ) merged JOIN ( SELECT COUNT ( pull_request_events.pull_request_id ) AS num_opened, - repo_id + pull_requests.repo_id FROM pull_requests JOIN pull_request_events ON pull_request_events.pull_request_id = pull_requests.pull_request_id @@ -54,7 +54,7 @@ def toss_pull_request_acceptance_rate(repo_id, begin_date=None, end_date=None, g AND pull_request_events.created_at BETWEEN :begin_date 
AND :end_date GROUP BY - repo_id + pull_requests.repo_id ) opened ON merged.repo_id = opened.repo_id """) diff --git a/augur/api/routes/config.py b/augur/api/routes/config.py index 6a2f82976e..08618091a9 100644 --- a/augur/api/routes/config.py +++ b/augur/api/routes/config.py @@ -8,6 +8,7 @@ # Disable the requirement for SSL by setting env["AUGUR_DEV"] = True from augur.application.config import get_development_flag +from augur.application.db.lib import get_session from augur.application.db.models import Config from augur.application.config import AugurConfig from augur.application.db.session import DatabaseSession @@ -45,7 +46,7 @@ def update_config(): update_dict = request.get_json() - with DatabaseSession(logger, engine=current_app.engine) as session: + with get_session() as session: for section, data in update_dict.items(): diff --git a/augur/api/routes/dei.py b/augur/api/routes/dei.py index 990a6e7368..621c89604d 100644 --- a/augur/api/routes/dei.py +++ b/augur/api/routes/dei.py @@ -92,7 +92,7 @@ def core_task_success_util_gen(repo_git): deiHook = CollectionRequest("core",primary_enabled_phases) deiHook.repo_list = [repo_url] - singleRoutine = AugurTaskRoutine(session,[deiHook]) + singleRoutine = AugurTaskRoutine(logger, session,[deiHook]) singleRoutine.start_data_collection() #start_block_of_repos(logger, session, [repo_url], primary_enabled_phases, "new") diff --git a/augur/api/routes/pull_request_reports.py b/augur/api/routes/pull_request_reports.py index 9520fc21f7..13aea31e8d 100644 --- a/augur/api/routes/pull_request_reports.py +++ b/augur/api/routes/pull_request_reports.py @@ -21,6 +21,12 @@ from bokeh.models.glyphs import Rect from bokeh.transform import dodge, factor_cmap, transform +# from selenium.webdriver import Firefox, FirefoxOptions +# options = FirefoxOptions() +# options.headless = True +# webdriver = Firefox(options=options) +#export_png(item, path, webdriver=webdriver) + warnings.filterwarnings('ignore') from augur.api.routes import AUGUR_API_VERSION @@ -604,6 +610,7 @@ def average_commits_per_PR(): # opts = FirefoxOptions() # opts.add_argument("--headless") # driver = webdriver.Firefox(firefox_options=opts) + # filename = export_png(grid, timeout=180, webdriver=webdriver) filename = export_png(grid, timeout=180) return send_file(filename) diff --git a/augur/api/routes/util.py b/augur/api/routes/util.py index 457afaf6ed..16d3c4db1d 100644 --- a/augur/api/routes/util.py +++ b/augur/api/routes/util.py @@ -5,13 +5,20 @@ import sqlalchemy as s import pandas as pd import json -from flask import Response, current_app +from flask import Response, current_app, jsonify from augur.application.db.lib import get_value from augur.application.logs import AugurLogger logger = AugurLogger("augur").get_logger() +@app.route("/api") +def get_api_version(): + return jsonify({ + "status": "up", + "route": AUGUR_API_VERSION + }) + @app.route('/{}/repo-groups'.format(AUGUR_API_VERSION)) def get_all_repo_groups(): #TODO: make this name automatic - wrapper? 
repoGroupsSQL = s.sql.text(""" diff --git a/augur/api/server.py b/augur/api/server.py index 64a4e94bf5..e66228c518 100644 --- a/augur/api/server.py +++ b/augur/api/server.py @@ -298,10 +298,11 @@ def create_cache_manager() -> CacheManager: """ cache_config = { - 'cache.type': 'file', - 'cache.data_dir': 'runtime/cache/', - 'cache.lock_dir': 'runtime/cache/' -} + 'cache.type': 'file', + # Allow setting cache directories via environment variables + 'cache.data_dir': Path(env.setdefault("CACHE_DATADIR", 'runtime/cache/')), + 'cache.lock_dir': Path(env.setdefault("CACHE_LOCKDIR", 'runtime/cache/')), + } if not os.path.exists(cache_config['cache.data_dir']): os.makedirs(cache_config['cache.data_dir']) @@ -733,6 +734,3 @@ def dispatch_request(self): cache_manager = create_cache_manager() server_cache = get_server_cache(cache_manager) - - - diff --git a/augur/api/view/api.py b/augur/api/view/api.py index cbd7e4a0f1..21d182024f 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -2,7 +2,7 @@ import re from flask_login import current_user, login_required from augur.application.db.models import Repo, RepoGroup, UserGroup, UserRepo -from augur.tasks.frontend import add_org_repo_list, parse_org_and_repo_name, parse_org_name +from augur.tasks.frontend import add_github_orgs_and_repos, parse_org_and_repo_name, parse_org_name, add_gitlab_repos from .utils import * from ..server import app from augur.application.db.session import DatabaseSession @@ -14,18 +14,6 @@ def cache(file=None): return redirect(url_for('static', filename="cache")) return redirect(url_for('static', filename="cache/" + toCacheFilename(file, False))) - -def add_existing_repo_to_group(session, user_id, group_name, repo_id): - - logger.info("Adding existing repo to group") - - group_id = UserGroup.convert_group_name_to_id(session, user_id, group_name) - if group_id is None: - return False - - result = UserRepo.insert(session, repo_id, group_id) - if not result: - return False def add_existing_org_to_group(session, user_id, group_name, rg_id): @@ -48,6 +36,8 @@ def add_existing_org_to_group(session, user_id, group_name, rg_id): @login_required def av_add_user_repo(): + print("Adding user repos") + urls = request.form.get('urls') group = request.form.get("group_name") @@ -68,58 +58,51 @@ def av_add_user_repo(): invalid_urls = [] - with DatabaseSession(logger, current_app.engine) as session: - for url in urls: - - # matches https://github.com/{org}/ or htts://github.com/{org} - if (org_name := Repo.parse_github_org_url(url)): - rg_obj = RepoGroup.get_by_name(session, org_name) - if rg_obj: - # add the orgs repos to the group - add_existing_org_to_group(session, current_user.user_id, group, rg_obj.repo_group_id) - - # matches https://github.com/{org}/{repo}/ or htts://github.com/{org}/{repo} - elif Repo.parse_github_repo_url(url)[0]: - org_name, repo_name = Repo.parse_github_repo_url(url) - repo_git = f"https://github.com/{org_name}/{repo_name}" - repo_obj = Repo.get_by_repo_git(session, repo_git) - if repo_obj: - add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) - - # matches /{org}/{repo}/ or /{org}/{repo} or {org}/{repo}/ or {org}/{repo} - elif (match := parse_org_and_repo_name(url)): - org, repo = match.groups() - repo_git = f"https://github.com/{org}/{repo}" - repo_obj = Repo.get_by_repo_git(session, repo_git) - if repo_obj: - add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) + orgs = [] + repo_urls = [] + gitlab_repo_urls = [] + for url in urls: + + # 
matches https://github.com/{org}/ or htts://github.com/{org} + if (org_name := Repo.parse_github_org_url(url)): + orgs.append(org_name) + + # matches https://github.com/{org}/{repo}/ or htts://github.com/{org}/{repo} + elif Repo.parse_github_repo_url(url)[0]: + repo_urls.append(url) + + # matches /{org}/{repo}/ or /{org}/{repo} or {org}/{repo}/ or {org}/{repo} + elif (match := parse_org_and_repo_name(url)): + org, repo = match.groups() + repo_git = f"https://github.com/{org}/{repo}" + repo_urls.append(repo_git) + + # matches /{org}/ or /{org} or {org}/ or {org} + elif (match := parse_org_name(url)): + org_name = match.group(1) + orgs.append(org_name) + + # matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo} + elif Repo.parse_gitlab_repo_url(url)[0]: + + org_name, repo_name = Repo.parse_gitlab_repo_url(url) + repo_git = f"https://gitlab.com/{org_name}/{repo_name}" - # matches /{org}/ or /{org} or {org}/ or {org} - elif (match := parse_org_name(url)): - org_name = match.group(1) - rg_obj = RepoGroup.get_by_name(session, org_name) - logger.info(rg_obj) - if rg_obj: - # add the orgs repos to the group - add_existing_org_to_group(session, current_user.user_id, group, rg_obj.repo_group_id) - - # matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo} - elif Repo.parse_gitlab_repo_url(url)[0]: - - org_name, repo_name = Repo.parse_github_repo_url(url) - repo_git = f"https://gitlab.com/{org_name}/{repo_name}" - - # TODO: gitlab ensure the whole repo git is inserted so it can be found here - repo_obj = Repo.get_by_repo_git(session, repo_git) - if repo_obj: - add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) - - else: - invalid_urls.append(url) - - if urls: - urls = [url.lower() for url in urls] - add_org_repo_list.si(current_user.user_id, group, urls).apply_async() + gitlab_repo_urls.append(repo_git) + else: + invalid_urls.append(url) + + + + if orgs or repo_urls: + repo_urls = [url.lower() for url in repo_urls] + orgs = [url.lower() for url in orgs] + flash(f"Adding repos: {repo_urls}") + flash(f"Adding orgs: {orgs}") + add_github_orgs_and_repos.si(current_user.user_id, group, orgs, repo_urls).apply_async() + + if gitlab_repo_urls: + add_gitlab_repos(current_user.user_id, group, gitlab_repo_urls) flash("Adding repos and orgs in the background") diff --git a/augur/api/view/augur_view.py b/augur/api/view/augur_view.py index 5166f2c5a3..ff4b25145c 100644 --- a/augur/api/view/augur_view.py +++ b/augur/api/view/augur_view.py @@ -48,6 +48,7 @@ def internal_server_error(error): traceback.print_tb(error.__traceback__, file=errout) # traceback.print_exception(error, file=errout) stacktrace = errout.getvalue() + stacktrace += f"\n{type(error).__name__}: {str(error)}" errout.close() except Exception as e: logger.error(e) diff --git a/augur/api/view/init.py b/augur/api/view/init.py index b0b4b27446..869b383a62 100644 --- a/augur/api/view/init.py +++ b/augur/api/view/init.py @@ -1,3 +1,4 @@ +import os from pathlib import Path from .server import Environment from augur.application.logs import AugurLogger @@ -18,7 +19,9 @@ def init_settings(): settings["cache_expiry"] = 604800 settings["serving"] = "http://augur.chaoss.io/api/unstable" settings["pagination_offset"] = 25 - settings["reports"] = "reports.yml" + # Put reports.yml in the same directory as the config file + config_dir = configFile.parent + settings["reports"] = os.path.join(config_dir, "reports.yml") settings["session_key"] = secrets.token_hex() def 
write_settings(current_settings): @@ -91,4 +94,4 @@ def write_settings(current_settings): # Initialize logging def init_logging(): global logger - logger = AugurLogger("augur_view", reset_logfiles=True).get_logger() + logger = AugurLogger("augur_view", reset_logfiles=False).get_logger() diff --git a/augur/api/view/server/Environment.py b/augur/api/view/server/Environment.py index 409a5975e5..76b8207ca5 100644 --- a/augur/api/view/server/Environment.py +++ b/augur/api/view/server/Environment.py @@ -49,4 +49,4 @@ def __str__(self)-> str: return str(os.environ) def __iter__(self): - return (item for item in os.environ.items) \ No newline at end of file + return (item for item in os.environ.items()) \ No newline at end of file diff --git a/augur/api/view/server/__init__.py b/augur/api/view/server/__init__.py index 2a54a556f7..e919a597a8 100644 --- a/augur/api/view/server/__init__.py +++ b/augur/api/view/server/__init__.py @@ -1,3 +1,2 @@ -from .Environment import Environment -from .ServerThread import ServerThread from .LoginException import LoginException +from .Environment import Environment \ No newline at end of file diff --git a/augur/application/cli/__init__.py b/augur/application/cli/__init__.py index e07e880bd9..e68af307bb 100644 --- a/augur/application/cli/__init__.py +++ b/augur/application/cli/__init__.py @@ -3,9 +3,10 @@ from functools import update_wrapper import os import sys -import socket import re import json +import httpx +import traceback from augur.application.db.engine import DatabaseEngine from augur.application.db import get_engine, dispose_database_engine @@ -16,14 +17,32 @@ def test_connection(function_internet_connection): @click.pass_context def new_func(ctx, *args, **kwargs): usage = re.search(r"Usage:\s(.*)\s\[OPTIONS\]", str(ctx.get_usage())).groups()[0] - try: - #try to ping google's dns server - socket.create_connection(("8.8.8.8",53)) - return ctx.invoke(function_internet_connection, *args, **kwargs) - except OSError as e: - print(e) - print(f"\n\n{usage} command setup failed\nYou are not connect to the internet. Please connect to the internet to run Augur\n") - sys.exit() + success = False + with httpx.Client() as client: + try: + _ = client.request( + method="GET", url="http://chaoss.community", timeout=10, follow_redirects=True) + success = True + except (TimeoutError, httpx.TimeoutException): + print("Request timed out.") + except httpx.NetworkError as e: + print(f"Network Error: {httpx.NetworkError}") + print(traceback.format_exc()) + except httpx.ProtocolError as e: + print(f"Protocol Error: {httpx.ProtocolError}") + print(traceback.format_exc()) + + if not success: + print( + f""" + \n\n{usage} command setup failed. 
+ There was an error while testing for network connectivity + Please check your connection to the internet to run Augur + Consider setting http_proxy variables for limited access installations.""" + ) + sys.exit(-1) + + return ctx.invoke(function_internet_connection, *args, **kwargs) return update_wrapper(new_func, function_internet_connection) @@ -69,7 +88,7 @@ def new_func(ctx, *args, **kwargs): print(f"\n\n{usage} command setup failed\nERROR: connecting to database\nHINT: The {incorrect_values} may be incorrectly specified in {location}\n") engine.dispose() - sys.exit() + sys.exit(-2) return update_wrapper(new_func, function_db_connection) diff --git a/augur/application/cli/_multicommand.py b/augur/application/cli/_multicommand.py index 2a1bfd1c71..19392b2742 100644 --- a/augur/application/cli/_multicommand.py +++ b/augur/application/cli/_multicommand.py @@ -30,7 +30,6 @@ def get_command(self, ctx, name): # Check that the command exists before importing if not cmdfile.is_file(): - return # Prefer to raise exception instead of silcencing it diff --git a/augur/application/cli/api.py b/augur/application/cli/api.py index d716957c0b..50044de7cf 100644 --- a/augur/application/cli/api.py +++ b/augur/application/cli/api.py @@ -14,15 +14,16 @@ from augur.application.db.session import DatabaseSession from augur.application.logs import AugurLogger -from augur.application.cli import test_connection, test_db_connection, with_database +from augur.application.cli import test_connection, test_db_connection, with_database, DatabaseContext from augur.application.cli._cli_util import _broadcast_signal_to_processes, raise_open_file_limit, clear_redis_caches, clear_rabbitmq_messages from augur.application.db.lib import get_value -logger = AugurLogger("augur", reset_logfiles=True).get_logger() +logger = AugurLogger("augur", reset_logfiles=False).get_logger() @click.group('api', short_help='Commands for controlling the backend API server') -def cli(): - pass +@click.pass_context +def cli(ctx): + ctx.obj = DatabaseContext() @cli.command("start") @click.option("--development", is_flag=True, default=False, help="Enable development mode") diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index a0480adab4..34d0343e66 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -10,13 +10,17 @@ import logging import psutil import signal -from redis.exceptions import ConnectionError as RedisConnectionError import uuid import traceback +import requests +from redis.exceptions import ConnectionError as RedisConnectionError from urllib.parse import urlparse from augur.tasks.start_tasks import augur_collection_monitor, create_collection_status_records from augur.tasks.git.facade_tasks import clone_repos +from augur.tasks.github.contributors import process_contributors +from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler +from augur.tasks.gitlab.gitlab_api_key_handler import GitlabApiKeyHandler from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model from augur.tasks.init.redis_connection import redis_connection from augur.application.db.models import UserRepo @@ -26,6 +30,7 @@ from augur.application.cli import test_connection, test_db_connection, with_database, DatabaseContext import sqlalchemy as s +from keyman.KeyClient import KeyClient, KeyPublisher logger = AugurLogger("augur", reset_logfiles=True).get_logger() @@ -38,14 +43,17 @@ def cli(ctx): @cli.command("start") 
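# --- Illustrative sketch (not part of the patch) ---
# The start command hunk below adds a --pidfile option: the controlling PID is
# written on startup and unlinked on shutdown. A hypothetical context-manager
# form of that same pattern (the patch itself does this inline, not with a
# context manager):
import os
from contextlib import contextmanager

@contextmanager
def pidfile_guard(path="main.pid"):
    """Write the current PID to `path` for the duration of the block."""
    with open(path, "w") as pidfile_io:
        pidfile_io.write(str(os.getpid()))
    try:
        yield path
    finally:
        os.unlink(path)
# --- end sketch ---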
@click.option("--disable-collection", is_flag=True, default=False, help="Turns off data collection workers") @click.option("--development", is_flag=True, default=False, help="Enable development mode, implies --disable-collection") +@click.option("--pidfile", default="main.pid", help="File to store the controlling process ID in") @click.option('--port') @test_connection @test_db_connection @with_database @click.pass_context -def start(ctx, disable_collection, development, port): +def start(ctx, disable_collection, development, pidfile, port): """Start Augur's backend server.""" - + with open(pidfile, "w") as pidfile_io: + pidfile_io.write(str(os.getpid())) + try: if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": raise_open_file_limit(100000) @@ -59,6 +67,8 @@ def start(ctx, disable_collection, development, port): if development: os.environ["AUGUR_DEV"] = "1" logger.info("Starting in development mode") + + os.environ["AUGUR_PIDFILE"] = pidfile try: gunicorn_location = os.getcwd() + "/augur/api/gunicorn_conf.py" @@ -70,28 +80,76 @@ def start(ctx, disable_collection, development, port): if not port: port = get_value("Server", "port") + os.environ["AUGUR_PORT"] = str(port) + + if disable_collection: + os.environ["AUGUR_DISABLE_COLLECTION"] = "1" + worker_vmem_cap = get_value("Celery", 'worker_process_vmem_cap') - gunicorn_command = f"gunicorn -c {gunicorn_location} -b {host}:{port} augur.api.server:app --log-file gunicorn.log" + # create rabbit messages so if it failed on shutdown the queues are clean + cleanup_collection_status_and_rabbit(logger, ctx.obj.engine) + + # Retrieve the log directory from the configuration or default to current directory + log_dir = get_value("Logging", "logs_directory") or "." + gunicorn_log_file = os.path.join(log_dir, "gunicorn.log") + + gunicorn_command = f"gunicorn -c {gunicorn_location} -b {host}:{port} augur.api.server:app --log-file {gunicorn_log_file}" server = subprocess.Popen(gunicorn_command.split(" ")) - time.sleep(3) + logger.info("awaiting Gunicorn start") + while not server.poll(): + try: + api_response = requests.get(f"http://{host}:{port}/api") + except requests.exceptions.ConnectionError as e: + time.sleep(0.5) + continue + + if not api_response.ok: + logger.critical("Gunicorn failed to start or was not reachable. Exiting") + exit(247) + break + else: + logger.critical("Gunicorn was shut down abnormally. 
Exiting") + exit(247) + logger.info('Gunicorn webserver started...') logger.info(f'Augur is running at: {"http" if development else "https"}://{host}:{port}') + logger.info(f"The API is available at '{api_response.json()['route']}'") processes = start_celery_worker_processes(float(worker_vmem_cap), disable_collection) - if os.path.exists("celerybeat-schedule.db"): + celery_beat_schedule_db = os.getenv("CELERYBEAT_SCHEDULE_DB", "celerybeat-schedule.db") + if os.path.exists(celery_beat_schedule_db): logger.info("Deleting old task schedule") - os.remove("celerybeat-schedule.db") + os.remove(celery_beat_schedule_db) log_level = get_value("Logging", "log_level") celery_beat_process = None - celery_command = f"celery -A augur.tasks.init.celery_app.celery_app beat -l {log_level.lower()}" + celery_command = f"celery -A augur.tasks.init.celery_app.celery_app beat -l {log_level.lower()} -s {celery_beat_schedule_db}" celery_beat_process = subprocess.Popen(celery_command.split(" ")) - + keypub = KeyPublisher() + if not disable_collection: + if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": + orchestrator = subprocess.Popen("python keyman/Orchestrator.py".split()) + # Wait for orchestrator startup + if not keypub.wait(republish=True): + logger.critical("Key orchestrator did not respond in time") + return + + # load keys + ghkeyman = GithubApiKeyHandler(logger) + glkeyman = GitlabApiKeyHandler(logger) + + for key in ghkeyman.keys: + keypub.publish(key, "github_rest") + keypub.publish(key, "github_graphql") + + for key in glkeyman.keys: + keypub.publish(key, "gitlab_rest") + with DatabaseSession(logger, engine=ctx.obj.engine) as session: clean_collection_status(session) @@ -100,15 +158,18 @@ def start(ctx, disable_collection, development, port): create_collection_status_records.si().apply_async() time.sleep(3) + #put contributor breadth back in. 
Not sure why it was commented out contributor_breadth_model.si().apply_async() # start cloning repos when augur starts clone_repos.si().apply_async() + process_contributors.si().apply_async() + augur_collection_monitor.si().apply_async() else: - logger.info("Collection disabled") + logger.info("Collection disabled") try: server.wait() @@ -130,9 +191,12 @@ def start(ctx, disable_collection, development, port): if not disable_collection: try: - cleanup_after_collection_halt(logger, ctx.obj.engine) + keypub.shutdown() + cleanup_collection_status_and_rabbit(logger, ctx.obj.engine) except RedisConnectionError: pass + + os.unlink(pidfile) def start_celery_worker_processes(vmem_cap_ratio, disable_collection=False): @@ -164,22 +228,22 @@ def determine_worker_processes(ratio,maximum): process_list.append(subprocess.Popen(scheduling_worker.split(" "))) sleep_time += 6 - #60% of estimate, Maximum value of 45 - core_num_processes = determine_worker_processes(.6, 45) + #60% of estimate, Maximum value of 45 : Reduced because it can be lower + core_num_processes = determine_worker_processes(.40, 90) logger.info(f"Starting core worker processes with concurrency={core_num_processes}") core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={core_num_processes} -n core:{uuid.uuid4().hex}@%h" process_list.append(subprocess.Popen(core_worker.split(" "))) sleep_time += 6 #20% of estimate, Maximum value of 25 - secondary_num_processes = determine_worker_processes(.25, 25) + secondary_num_processes = determine_worker_processes(.39, 50) logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}") secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" process_list.append(subprocess.Popen(secondary_worker.split(" "))) sleep_time += 6 #15% of estimate, Maximum value of 20 - facade_num_processes = determine_worker_processes(.15, 20) + facade_num_processes = determine_worker_processes(.17, 20) logger.info(f"Starting facade worker processes with concurrency={facade_num_processes}") facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={facade_num_processes} -n facade:{uuid.uuid4().hex}@%h -Q facade" @@ -204,6 +268,54 @@ def stop(ctx): augur_stop(signal.SIGTERM, logger, ctx.obj.engine) +@cli.command('stop-collection-blocking') +@test_connection +@test_db_connection +@with_database +@click.pass_context +def stop_collection(ctx): + """ + Stop collection tasks if they are running, block until complete + """ + processes = get_augur_processes() + + stopped = [] + + p: psutil.Process + for p in processes: + if p.name() == "celery": + stopped.append(p) + p.terminate() + + if not len(stopped): + logger.info("No collection processes found") + return + + _, alive = psutil.wait_procs(stopped, 5, + lambda p: logger.info(f"STOPPED: {p.pid}")) + + killed = [] + while True: + for i in range(len(alive)): + if alive[i].status() == psutil.STATUS_ZOMBIE: + logger.info(f"KILLING ZOMBIE: {alive[i].pid}") + alive[i].kill() + killed.append(i) + elif not alive[i].is_running(): + logger.info(f"STOPPED: {p.pid}") + killed.append(i) + + for i in reversed(killed): + alive.pop(i) + + if not len(alive): + break + + logger.info(f"Waiting on [{', '.join(str(p.pid for p in alive))}]") + time.sleep(0.5) + + cleanup_collection_status_and_rabbit(logger, ctx.obj.engine) + @cli.command('kill') @test_connection 
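# --- Illustrative sketch (not part of the patch) ---
# The new `stop-collection-blocking` command above terminates celery workers,
# waits for them, and reaps zombies before cleaning up collection state. A
# minimal standalone version of that terminate-then-reap pattern; selecting
# processes purely by name is an assumption (the patch filters Augur's own
# process list):
import time
import psutil

def stop_processes_blocking(name="celery", grace_seconds=5):
    """Terminate matching processes and block until they are all gone."""
    targets = [p for p in psutil.process_iter(["name"]) if p.info["name"] == name]
    for p in targets:
        p.terminate()

    # wait_procs returns (gone, alive); anything still alive gets polled below
    _, alive = psutil.wait_procs(targets, timeout=grace_seconds)
    while alive:
        for p in list(alive):
            if not p.is_running():
                alive.remove(p)
            elif p.status() == psutil.STATUS_ZOMBIE:
                p.kill()  # force-kill zombies that ignored SIGTERM
                alive.remove(p)
        time.sleep(0.5)
# --- end sketch ---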
@test_db_connection @@ -230,10 +342,10 @@ def augur_stop(signal, logger, engine): _broadcast_signal_to_processes(augur_processes, broadcast_signal=signal, given_logger=logger) if "celery" in process_names: - cleanup_after_collection_halt(logger, engine) + cleanup_collection_status_and_rabbit(logger, engine) -def cleanup_after_collection_halt(logger, engine): +def cleanup_collection_status_and_rabbit(logger, engine): clear_redis_caches() connection_string = get_value("RabbitMQ", "connection_string") @@ -317,7 +429,7 @@ def assign_orphan_repos_to_default_user(session): repos = session.execute_sql(query).fetchall() for repo in repos: - UserRepo.insert(session,repo[0],1) + UserRepo.insert(session, repo[0],1) @cli.command('export-env') @@ -368,7 +480,7 @@ def processes(): Outputs the name/PID of all Augur server & worker processes""" augur_processes = get_augur_processes() for process in augur_processes: - logger.info(f"Found process {process.pid}") + logger.info(f"Found process {process.pid} [{process.name()}] -> Parent: {process.parent().pid}") def get_augur_processes(): augur_processes = [] diff --git a/augur/application/cli/collection.py b/augur/application/cli/collection.py index 63c433a79e..b42f1f3fcc 100644 --- a/augur/application/cli/collection.py +++ b/augur/application/cli/collection.py @@ -17,19 +17,24 @@ from augur.tasks.start_tasks import augur_collection_monitor, create_collection_status_records from augur.tasks.git.facade_tasks import clone_repos +from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler +from augur.tasks.gitlab.gitlab_api_key_handler import GitlabApiKeyHandler from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model from augur.application.db.models import UserRepo from augur.application.db.session import DatabaseSession from augur.application.logs import AugurLogger from augur.application.db.lib import get_value -from augur.application.cli import test_connection, test_db_connection, with_database +from augur.application.cli import test_connection, test_db_connection, with_database, DatabaseContext from augur.application.cli._cli_util import _broadcast_signal_to_processes, raise_open_file_limit, clear_redis_caches, clear_rabbitmq_messages -logger = AugurLogger("augur", reset_logfiles=True).get_logger() +from keyman.KeyClient import KeyClient, KeyPublisher + +logger = AugurLogger("augur", reset_logfiles=False).get_logger() @click.group('server', short_help='Commands for controlling the backend API server & data collection workers') -def cli(): - pass +@click.pass_context +def cli(ctx): + ctx.obj = DatabaseContext() @cli.command("start") @click.option("--development", is_flag=True, default=False, help="Enable development mode, implies --disable-collection") @@ -50,6 +55,26 @@ def start(ctx, development): logger.error("Failed to raise open file limit!") raise e + keypub = KeyPublisher() + + orchestrator = subprocess.Popen("python keyman/Orchestrator.py".split()) + + # Wait for orchestrator startup + if not keypub.wait(republish=True): + logger.critical("Key orchestrator did not respond in time") + return + + # load keys + ghkeyman = GithubApiKeyHandler(logger) + glkeyman = GitlabApiKeyHandler(logger) + + for key in ghkeyman.keys: + keypub.publish(key, "github_rest") + keypub.publish(key, "github_graphql") + + for key in glkeyman.keys: + keypub.publish(key, "gitlab_rest") + if development: os.environ["AUGUR_DEV"] = "1" logger.info("Starting in development mode") @@ -93,6 +118,8 @@ def 
start(ctx, development): if p: p.terminate() + keypub.shutdown() + if celery_beat_process: logger.info("Shutting down celery beat process") celery_beat_process.terminate() @@ -124,22 +151,22 @@ def determine_worker_processes(ratio,maximum): process_list.append(subprocess.Popen(scheduling_worker.split(" "))) sleep_time += 6 - #60% of estimate, Maximum value of 45 - core_num_processes = determine_worker_processes(.6, 45) + #60% of estimate, Maximum value of 45: Reduced because not needed + core_num_processes = determine_worker_processes(.40, 90) logger.info(f"Starting core worker processes with concurrency={core_num_processes}") core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={core_num_processes} -n core:{uuid.uuid4().hex}@%h" process_list.append(subprocess.Popen(core_worker.split(" "))) sleep_time += 6 #20% of estimate, Maximum value of 25 - secondary_num_processes = determine_worker_processes(.25, 25) + secondary_num_processes = determine_worker_processes(.39, 50) logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}") secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" process_list.append(subprocess.Popen(secondary_worker.split(" "))) sleep_time += 6 #15% of estimate, Maximum value of 20 - facade_num_processes = determine_worker_processes(.15, 20) + facade_num_processes = determine_worker_processes(.17, 20) logger.info(f"Starting facade worker processes with concurrency={facade_num_processes}") facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={facade_num_processes} -n facade:{uuid.uuid4().hex}@%h -Q facade" @@ -301,4 +328,4 @@ def assign_orphan_repos_to_default_user(session): repos = session.execute_sql(query).fetchall() for repo in repos: - UserRepo.insert(session,repo[0],1) + UserRepo.insert(session, repo[0],1) diff --git a/augur/application/cli/config.py b/augur/application/cli/config.py index e5beae92eb..1e73698fc3 100644 --- a/augur/application/cli/config.py +++ b/augur/application/cli/config.py @@ -7,6 +7,7 @@ import json import logging +from augur.application.db.models import Config from augur.application.db.session import DatabaseSession from augur.application.config import AugurConfig from augur.application.cli import DatabaseContext, test_connection, test_db_connection, with_database @@ -28,11 +29,12 @@ def cli(ctx): @click.option('--gitlab-api-key', help="GitLab API key for data collection from the GitLab API", envvar=ENVVAR_PREFIX + 'GITLAB_API_KEY') @click.option('--redis-conn-string', help="String to connect to redis cache", envvar=ENVVAR_PREFIX + 'REDIS_CONN_STRING') @click.option('--rabbitmq-conn-string', help="String to connect to rabbitmq broker", envvar=ENVVAR_PREFIX + 'RABBITMQ_CONN_STRING') +@click.option('--logs-directory', help="Directory to store logs", envvar=ENVVAR_PREFIX + 'LOGS_DIRECTORY') @test_connection @test_db_connection @with_database @click.pass_context -def init_config(ctx, github_api_key, facade_repo_directory, gitlab_api_key, redis_conn_string, rabbitmq_conn_string): +def init_config(ctx, github_api_key, facade_repo_directory, gitlab_api_key, redis_conn_string, rabbitmq_conn_string, logs_directory): if not github_api_key: @@ -98,7 +100,7 @@ def init_config(ctx, github_api_key, facade_repo_directory, gitlab_api_key, redi default_config["Facade"]["repo_directory"] = facade_repo_directory - 
default_config["Logging"]["logs_directory"] = ROOT_AUGUR_DIRECTORY + "/logs/" + default_config["Logging"]["logs_directory"] = logs_directory or (ROOT_AUGUR_DIRECTORY + "/logs/") config.load_config_from_dict(default_config) @@ -160,7 +162,7 @@ def add_section(ctx, section_name, file): @click.option('--section', required=True) @click.option('--setting', required=True) @click.option('--value', required=True) -@click.option('--data-type', required=True) +@click.option('--data-type') @test_connection @test_db_connection @with_database @@ -169,6 +171,12 @@ def config_set(ctx, section, setting, value, data_type): with DatabaseSession(logger, engine=ctx.obj.engine) as session: config = AugurConfig(logger, session) + + if not data_type: + result = session.query(Config).filter(Config.section_name == section, Config.setting_name == setting).all() + if not result: + return click.echo("You must specify a data-type if the setting does not already exist") + data_type = result[0].type if data_type not in config.accepted_types: print(f"Error invalid type for config. Please use one of these types: {config.accepted_types}") @@ -218,6 +226,22 @@ def config_get(ctx, section, setting): else: print(f"Error: {section} section not found in config") +@cli.command('get_all_json') +def config_get_all_json(): + data = {} + try: + with DatabaseSession(logger) as session: + sections = session.query(Config.section_name).distinct().all() + for section in sections: + data[section[0]] = {} + + for row in session.query(Config).all(): + data[row.section_name][row.setting_name] = row.value + except: + pass + + print(json.dumps(data, indent=4)) + @cli.command('clear') @test_connection @test_db_connection @@ -240,5 +264,3 @@ def clear_config(ctx): config.clear() print("Config cleared") - - diff --git a/augur/application/cli/db.py b/augur/application/cli/db.py index c2ffc9463e..c20fcf0b2e 100644 --- a/augur/application/cli/db.py +++ b/augur/application/cli/db.py @@ -14,7 +14,12 @@ import re import stat as stat_module -from augur.application.cli import test_connection, test_db_connection, with_database, DatabaseContext +from augur.application.cli import ( + test_connection, + test_db_connection, + with_database, + DatabaseContext, +) from augur.application.db.session import DatabaseSession from sqlalchemy import update @@ -23,8 +28,9 @@ logger = logging.getLogger(__name__) + @click.group("db", short_help="Database utilities") -@click.pass_context +@click.pass_context def cli(ctx): ctx.obj = DatabaseContext() @@ -36,36 +42,43 @@ def cli(ctx): @with_database @click.pass_context def add_repos(ctx, filename): - """Add repositories to Augur's database. + """Add repositories to Augur's database. The .csv file format should be repo_url,group_id NOTE: The Group ID must already exist in the REPO_Groups Table. 
- If you want to add an entire GitHub organization, refer to the command: augur db add-github-org""" + If you want to add an entire GitHub organization, refer to the command: augur db add-github-org""" from augur.tasks.github.util.github_task_session import GithubTaskSession from augur.util.repo_load_controller import RepoLoadController with GithubTaskSession(logger, engine=ctx.obj.engine) as session: - controller = RepoLoadController(session) + line_total = len(open(filename).readlines()) with open(filename) as upload_repos_file: data = csv.reader(upload_repos_file, delimiter=",") - for row in data: - + for line_num, row in enumerate(data): repo_data = {} repo_data["url"] = row[0] try: repo_data["repo_group_id"] = int(row[1]) except ValueError: - print(f"Invalid repo group_id: {row[1]} for Git url: `{repo_data['url']}`") + print( + f"Invalid repo group_id: {row[1]} for Git url: `{repo_data['url']}`" + ) continue - + print( - f"Inserting repo with Git URL `{repo_data['url']}` into repo group {repo_data['repo_group_id']}") - controller.add_cli_repo(repo_data) + f"Inserting repo {line_num}/{line_total} with Git URL `{repo_data['url']}` into repo group {repo_data['repo_group_id']}" + ) + succeeded, message = controller.add_cli_repo(repo_data) + if not succeeded: + logger.error(f"insert repo failed with error: {message['status']}`") + else: + logger.info(f"Repo added: {repo_data}") + print("Success") @cli.command("get-repo-groups") @@ -101,7 +114,6 @@ def add_repo_groups(ctx, filename): Create new repo groups in Augur's database """ with ctx.obj.engine.begin() as connection: - df = pd.read_sql( s.sql.text("SELECT repo_group_id FROM augur_data.repo_groups"), connection, @@ -117,7 +129,6 @@ def add_repo_groups(ctx, filename): with open(filename) as create_repo_groups_file: data = csv.reader(create_repo_groups_file, delimiter=",") for row in data: - # Handle case where there's a hanging empty row. if not row: logger.info("Skipping empty data...") @@ -137,6 +148,7 @@ def add_repo_groups(ctx, filename): f"Repo group with ID {row[1]} for repo group {row[1]} already exists, skipping..." 
) + @cli.command("add-github-org") @click.argument("organization_name") @test_connection @@ -151,14 +163,13 @@ def add_github_org(ctx, organization_name): from augur.util.repo_load_controller import RepoLoadController with GithubTaskSession(logger, engine=ctx.obj.engine) as session: - controller = RepoLoadController(session) controller.add_cli_org(organization_name) + # get_db_version is a helper function to print_db_version and upgrade_db_version def get_db_version(engine): - db_version_sql = s.sql.text( """ SELECT * FROM augur_operations.augur_settings WHERE setting = 'augur_data_version' @@ -166,14 +177,12 @@ def get_db_version(engine): ) with engine.connect() as connection: - result = int(connection.execute(db_version_sql).fetchone()[2]) engine.dispose() return result - @cli.command("print-db-version") @test_connection @test_db_connection @@ -252,10 +261,10 @@ def update_api_key(ctx, api_key): ) with ctx.obj.engine.begin() as connection: - connection.execute(update_api_key_sql, api_key=api_key) logger.info(f"Updated Augur API key to: {api_key}") + @cli.command("get-api-key") @test_connection @test_db_connection @@ -282,20 +291,21 @@ def get_api_key(ctx): def check_pgpass(): augur_db_env_var = getenv("AUGUR_DB") if augur_db_env_var: - # gets the user, passowrd, host, port, and database_name out of environment variable # assumes database string of structure //:@:/ # it returns a tuple like (, , , , 0: + print("\n\nWARNING: There are duplicate keys this will slow down collection") + print("Duplicate keys".center(40)) + for key in duplicate_keys: + print(key) + + + if len(invalid_keys) > 0: + invalid_key_header = "Invalid Keys".center(40) + print("\n") + print(invalid_key_header) + for key in invalid_keys: + print(key) + print("") + + + + engine.dispose() + + +def epoch_to_local_time_with_am_pm(epoch): + local_time = datetime.fromtimestamp(epoch) + formatted_time = local_time.strftime('%I:%M %p') # This format includes the date as well + return formatted_time + + +def find_duplicates(lst): + counter = Counter(lst) + return [item for item, count in counter.items() if count > 1] + diff --git a/augur/application/cli/jumpstart.py b/augur/application/cli/jumpstart.py new file mode 100644 index 0000000000..b65255ec17 --- /dev/null +++ b/augur/application/cli/jumpstart.py @@ -0,0 +1,98 @@ +import psutil +import click +import time +import subprocess +from pathlib import Path +from datetime import datetime + +@click.group(invoke_without_command=True) +@click.pass_context +def cli(ctx): + if ctx.invoked_subcommand is None: + p = check_running() + if not p: + click.echo("Jumpstart is not running. 
Start it with: augur jumpstart run") + return + + click.echo(f"Connecting to Jumpstart: [{p.pid}]") + + while p.is_running() and not len(p.connections("unix")): + # Waiting for app to open fd socket + time.sleep(0.1) + + if not p.is_running(): + click.echo("Error: Jumpstart server exited abnormally") + return + + from jumpstart.tui import run_app + run_app(ctx=ctx) + +def check_running(pidfile = ".jumpstart.pid") -> psutil.Process: + jumpidf = Path(pidfile) + + try: + jumpid, create_time = jumpidf.read_text().splitlines() + jumpp = psutil.Process(int(jumpid)) + + if create_time != str(jumpp.create_time()): + # PID was reused, not the original + jumpidf.unlink() + return + + return jumpp + except (psutil.NoSuchProcess, FileNotFoundError): + return + except PermissionError: + click.echo(f"Permission denied while reading from or writing to pidfile [{str(jumpidf.resolve())}]") + +@cli.command("status") +def get_status(): + p = check_running() + + if not p: + click.echo("Jumpstart is not running") + else: + since = datetime.fromtimestamp(p.create_time()).astimezone() + delta = datetime.now().astimezone() - since + click.echo(f"Jumpstart is running at: [{p.pid}] since {since.strftime('%a %b %d, %Y %H:%M:%S %z:%Z')} [{delta}]") + +@cli.command("run") +@click.pass_context +def startup(ctx): + p = check_running() + + if not p: + click.echo("Starting") + p = launch(ctx) + else: + click.echo(f"Jumpstart is already running [{p.pid}]") + +@cli.command("processID") +def get_main_ID(): + p = check_running() + + if p: + click.echo(p.pid) + +@cli.command("shutdown") +def shutdown_server(): + p = check_running() + + if not p: + click.echo("Jumpstart is not running") + return + + click.echo("Blocking on shutdown") + p.terminate() + p.wait() + +def launch(ctx, pidfile = ".jumpstart.pid", socketfile = "jumpstart.sock"): + service = subprocess.Popen(f"python -m jumpstart.jumpstart pidfile={pidfile} socketfile={socketfile}".split()) + + # Popen object does not have create_time for some reason + ext_process = psutil.Process(service.pid) + + with open(pidfile, "w") as file: + file.write(f"{ext_process.pid}\n{ext_process.create_time()}") + + return ext_process diff --git a/augur/application/cli/tasks.py b/augur/application/cli/tasks.py index b4bec994eb..f760dfddeb 100644 --- a/augur/application/cli/tasks.py +++ b/augur/application/cli/tasks.py @@ -17,7 +17,7 @@ from augur.application.cli import test_connection, test_db_connection from augur.application.cli.backend import clear_rabbitmq_messages, raise_open_file_limit -logger = AugurLogger("augur", reset_logfiles=True).get_logger() +logger = AugurLogger("augur", reset_logfiles=False).get_logger() @click.group('celery', short_help='Commands for controlling the backend API server & data collection workers') def cli(): @@ -36,8 +36,8 @@ def start(): secondary_worker_process = None scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" - core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n core:{uuid.uuid4().hex}@%h" - secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=25 -n secondary:{uuid.uuid4().hex}@%h -Q secondary" + core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=90 -n core:{uuid.uuid4().hex}@%h" + secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=20 -n 
secondary:{uuid.uuid4().hex}@%h -Q secondary" scheduling_worker_process = subprocess.Popen(scheduling_worker.split(" ")) core_worker_process = subprocess.Popen(core_worker.split(" ")) diff --git a/augur/application/config.py b/augur/application/config.py index 8998d6094e..e3e93302eb 100644 --- a/augur/application/config.py +++ b/augur/application/config.py @@ -161,7 +161,7 @@ def get_section(self, section_name) -> dict: Returns: The section data as a dict """ - query = self.session.query(Config).filter_by(section_name=section_name) + query = self.session.query(Config).filter_by(section_name=section_name).order_by(Config.setting_name.asc()) section_data = execute_session_query(query, 'all') section_dict = {} @@ -213,7 +213,7 @@ def load_config(self) -> dict: The config from the database """ # get all the sections in the config table - query = self.session.query(Config.section_name) + query = self.session.query(Config.section_name).order_by(Config.section_name.asc()) section_names = execute_session_query(query, 'all') config = {} @@ -288,6 +288,7 @@ def add_or_update_settings(self, settings: List[dict]): query = self.session.query(Config).filter(and_(Config.section_name == setting["section_name"],Config.setting_name == setting["setting_name"]) ) if execute_session_query(query, 'first') is None: + # TODO: Update to use bulk insert dicts so config doesn't require database session self.session.insert_data(setting,Config, ["section_name", "setting_name"]) else: #If setting exists. use raw update to not increase autoincrement diff --git a/augur/application/db/data_parse.py b/augur/application/db/data_parse.py index 1d18e26196..6f20fa35a7 100644 --- a/augur/application/db/data_parse.py +++ b/augur/application/db/data_parse.py @@ -138,9 +138,8 @@ def extract_needed_merge_request_assignee_data(assignees: List[dict], repo_id: i for assignee in assignees: assignee_dict = { - 'contrib_id': None, + 'contrib_id': assignee["cntrb_id"], 'repo_id': repo_id, - # TODO: Temporarily setting this to id which the id of the contributor, unitl we can get the contrib_id set and create a unique on the contrib_id and the pull_request_id 'pr_assignee_src_id': assignee["id"], 'tool_source': tool_source, 'tool_version': tool_version, @@ -284,7 +283,7 @@ def extract_pr_review_message_ref_data(comment: dict, augur_pr_review_id, github return pr_review_comment_message_ref -def extract_pr_event_data(event: dict, pr_id: int, platform_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> dict: +def extract_pr_event_data(event: dict, pr_id: int, gh_src_id: int, platform_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> dict: pr_event = { 'pull_request_id': pr_id, @@ -292,13 +291,13 @@ def extract_pr_event_data(event: dict, pr_id: int, platform_id: int, repo_id: in 'action': event['event'], 'action_commit_hash': None, 'created_at': event['created_at'], - 'issue_event_src_id': int(event['issue']["id"]), + 'issue_event_src_id': event["id"], 'node_id': event['node_id'], 'node_url': event['url'], 'tool_source': tool_source, 'tool_version': tool_version, 'data_source': data_source, - 'pr_platform_event_id': int(event['issue']["id"]), + 'pr_platform_event_id': gh_src_id, 'platform_id': platform_id, 'repo_id': repo_id } @@ -792,6 +791,11 @@ def extract_needed_pr_data_from_gitlab_merge_request(pr, repo_id, tool_source, t Returns: Parsed pr dict """ + pr_closed_datetime = pr['closed_at'] + pr_merged_datetime = pr['merged_at'] + + if not pr_closed_datetime: + pr_closed_datetime = 
pr_merged_datetime pr_dict = { 'repo_id': repo_id, @@ -807,13 +811,12 @@ def extract_needed_pr_data_from_gitlab_merge_request(pr, repo_id, tool_source, t 'pr_src_state': pr['state'], 'pr_src_locked': pr['discussion_locked'], 'pr_src_title': pr['title'], - # TODO: Add contributor logic for gitlab - 'pr_augur_contributor_id': None, + 'pr_augur_contributor_id': pr["cntrb_id"], 'pr_body': pr['description'], 'pr_created_at': pr['created_at'], 'pr_updated_at': pr['updated_at'], - 'pr_closed_at': pr['closed_at'], - 'pr_merged_at': pr['merged_at'], + 'pr_closed_at': pr_closed_datetime, + 'pr_merged_at': pr_merged_datetime, 'pr_merge_commit_sha': pr['merge_commit_sha'], 'pr_teams': None, 'pr_milestone': pr['milestone'].get('title') if pr['milestone'] else None, @@ -1153,7 +1156,7 @@ def extract_needed_gitlab_issue_message_ref_data(message: dict, issue_id: int, r return message_ref_dict -def extract_needed_gitlab_message_data(comment: dict, platform_id: int, tool_source: str, tool_version: str, data_source: str): +def extract_needed_gitlab_message_data(comment: dict, platform_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str): """ Extract specific metadata for a comment from an api response and connect it to the relevant platform id. @@ -1171,6 +1174,7 @@ def extract_needed_gitlab_message_data(comment: dict, platform_id: int, tool_sou """ comment_dict = { + "repo_id": repo_id, "pltfrm_id": platform_id, "msg_text": comment['body'], "msg_timestamp": comment['created_at'], diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index c1da707dbf..3e22b18be4 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -1,9 +1,20 @@ -import sqlalchemy as s +import re +import time +import random import logging -from typing import List, Any, Optional -from augur.application.db.models import Config -from augur.application.db import get_session +import sqlalchemy as s +from sqlalchemy import func +from sqlalchemy.exc import DataError +from sqlalchemy.dialects import postgresql +from sqlalchemy.exc import OperationalError +from psycopg2.errors import DeadlockDetected +from typing import List, Any, Optional, Union + +from augur.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest, PullRequestReview, ContributorsAlias,UnresolvedCommitEmail, Contributor, CollectionStatus, UserGroup, RepoGroup +from augur.tasks.util.collection_state import CollectionState +from augur.application.db import get_session, get_engine from augur.application.db.util import execute_session_query +from augur.application.db.session import remove_duplicates_by_uniques, remove_null_characters_from_list_of_dicts logger = logging.getLogger("db_lib") @@ -15,7 +26,7 @@ def convert_type_of_value(config_dict, logger=None): if data_type == "str" or data_type is None: return config_dict - elif data_type == "int": + if data_type == "int": config_dict["value"] = int(config_dict["value"]) elif data_type == "bool": @@ -95,3 +106,510 @@ def get_value(section_name: str, setting_name: str) -> Optional[Any]: setting_dict = convert_type_of_value(setting_dict, logger) return setting_dict["value"] + + +def execute_sql(sql_text): + + engine = get_engine() + + with engine.begin() as connection: + + return_data = connection.execute(sql_text) + + return return_data + +def fetchall_data_from_sql_text(sql_text): + + engine = get_engine() + + with engine.begin() as connection: + + result = connection.execute(sql_text) + return [dict(row) for row in result.mappings()] + +def 
get_repo_by_repo_git(repo_git: str): + + with get_session() as session: + + query = session.query(Repo).filter(Repo.repo_git == repo_git) + repo = execute_session_query(query, 'one') + + return repo + +def get_repo_by_repo_id(repo_id): + + with get_session() as session: + + query = session.query(Repo).filter(Repo.repo_id == repo_id) + repo = execute_session_query(query, 'one') + + return repo + +def get_github_repo_by_src_id(src_id): + + with get_session() as session: + + query = session.query(Repo).filter(Repo.repo_src_id == src_id, Repo.repo_git.ilike(f'%https://github.com%')) + repo = execute_session_query(query, 'first') + + return repo + +def get_gitlab_repo_by_src_id(src_id): + + with get_session() as session: + + query = session.query(Repo).filter(Repo.repo_src_id == src_id, Repo.repo_git.ilike(f'%https://gitlab.com%')) + repo = execute_session_query(query, 'first') + + return repo + + +def remove_working_commits_by_repo_id_and_hashes(repo_id, commit_hashes): + + remove_working_commits = s.sql.text("""DELETE FROM working_commits + WHERE repos_id = :repo_id AND working_commit IN :hashes + """).bindparams(repo_id=repo_id,hashes=tuple(commit_hashes)) + + execute_sql(remove_working_commits) + +def remove_working_commits_by_repo_id(repo_id): + + remove_working_commits = s.sql.text("""DELETE FROM working_commits WHERE repos_id=:repo_id""").bindparams(repo_id=repo_id) + execute_sql(remove_working_commits) + +def remove_commits_by_repo_id_and_hashes(repo_id, commit_hashes): + + remove_commit = s.sql.text("""DELETE FROM commits + WHERE repo_id=:repo_id + AND cmt_commit_hash IN :hashes""").bindparams(repo_id=repo_id,hashes=tuple(commit_hashes)) + execute_sql(remove_commit) + + +def remove_commits_by_repo_id(repo_id): + + remove_commits = s.sql.text("""DELETE FROM commits WHERE repo_id=:repo_id""").bindparams(repo_id=repo_id) + execute_sql(remove_commits) + +def get_working_commits_by_repo_id(repo_id): + + query = s.sql.text("""SELECT working_commit FROM working_commits WHERE repos_id=:repo_id + """).bindparams(repo_id=repo_id) + + try: + working_commits = fetchall_data_from_sql_text(query) + except: + working_commits = [] + + return working_commits + +def get_missing_commit_message_hashes(repo_id): + + fetch_missing_hashes_sql = s.sql.text(""" + SELECT DISTINCT cmt_commit_hash FROM commits + WHERE repo_id=:repo_id + AND cmt_commit_hash NOT IN + (SELECT DISTINCT cmt_hash FROM commit_messages WHERE repo_id=:repo_id); + """).bindparams(repo_id=repo_id) + + try: + missing_commit_hashes = fetchall_data_from_sql_text(fetch_missing_hashes_sql) + except: + missing_commit_hashes = [] + + return missing_commit_hashes + +def get_worker_oauth_keys(platform: str): + + with get_session() as session: + + results = session.query(WorkerOauth).filter(WorkerOauth.platform == platform).order_by(func.random()).all() + + return [row.access_token for row in results] + +def get_active_repo_count(collection_type): + + with get_session() as session: + + return session.query(CollectionStatus).filter(getattr(CollectionStatus,f"{collection_type}_status" ) == CollectionState.COLLECTING.value).count() + + +def facade_bulk_insert_commits(logger, records): + + with get_session() as session: + + try: + session.execute( + s.insert(Commit), + records, + ) + session.commit() + except Exception as e: + session.rollback() + + if len(records) > 1: + logger.error(f"Ran into issue when trying to insert commits \n Error: {e}") + + #split list into halves and retry insert until we isolate offending record + firsthalfRecords = 
records[:len(records)//2] + secondhalfRecords = records[len(records)//2:] + + facade_bulk_insert_commits(logger, firsthalfRecords) + facade_bulk_insert_commits(logger, secondhalfRecords) + elif len(records) == 1 and isinstance(e,DataError) and "time zone displacement" in f"{e}": + commit_record = records[0] + #replace incomprehensible dates with epoch. + #2021-10-11 11:57:46 -0500 + + # placeholder_date = "1970-01-01 00:00:15 -0500" + placeholder_date = commit_record['author_timestamp'] + + # Reconstruct timezone portion of the date string to UTC + placeholder_date = re.split("[-+]", placeholder_date) + placeholder_date.pop() + placeholder_date = "-".join(placeholder_date) + "+0000" + + #Check for improper utc timezone offset + #UTC timezone offset should be between -14:00 and +14:00 + + commit_record['author_timestamp'] = placeholder_date + commit_record['committer_timestamp'] = placeholder_date + + session.execute( + s.insert(Commit), + [commit_record], + ) + session.commit() + else: + raise e + + +def batch_insert_contributors(logger, data: Union[List[dict], dict]) -> Optional[List[dict]]: + + batch_size = 1000 + + for i in range(0, len(data), batch_size): + batch = data[i:i + batch_size] + + bulk_insert_dicts(logger, batch, Contributor, ['cntrb_id']) + + + +def bulk_insert_dicts(logger, data: Union[List[dict], dict], table, natural_keys: List[str], return_columns: Optional[List[str]] = None, string_fields: Optional[List[str]] = None, on_conflict_update:bool = True) -> Optional[List[dict]]: + + if isinstance(data, list) is False: + + # if a dict is passed to data then + # convert it to a list with one value + if isinstance(data, dict) is True: + data = [data] + + else: + logger.error("Data must be a list or a dict") + return None + + if len(data) == 0: + # self.logger.info("Gave no data to insert, returning...") + return None + + if isinstance(data[0], dict) is False: + logger.error("Must be list of dicts") + return None + + # remove any duplicate data + # this only counts something as a duplicate if every field is the same + data = remove_duplicates_by_uniques(data, natural_keys) + + # remove null data from string fields + if string_fields and isinstance(string_fields, list): + data = remove_null_characters_from_list_of_dicts(data, string_fields) + + # creates list of arguments to tell sqlalchemy what columns to return after the data is inserted + returning_args = [] + if return_columns: + for column in return_columns: + argument = getattr(table, column) + returning_args.append(argument) + + # creates insert on table + # that returns cols specificed in returning_args + # and inserts the data specified in data + # NOTE: if return_columns does not have an values this still works + stmnt = postgresql.insert(table).returning(*returning_args).values(data) + + + if on_conflict_update: + + # create a dict that the on_conflict_do_update method requires to be able to map updates whenever there is a conflict. 
See sqlalchemy docs for more explanation and examples: https://docs.sqlalchemy.org/en/14/dialects/postgresql.html#updating-using-the-excluded-insert-values + setDict = {} + for key in data[0].keys(): + setDict[key] = getattr(stmnt.excluded, key) + + stmnt = stmnt.on_conflict_do_update( + #This might need to change + index_elements=natural_keys, + + #Columns to be updated + set_ = setDict + ) + + else: + stmnt = stmnt.on_conflict_do_nothing( + index_elements=natural_keys + ) + + + # print(str(stmnt.compile(dialect=postgresql.dialect()))) + attempts = 0 + # creates list from 1 to 10 / changed to 10-30 because deadlocks are taking longer + sleep_time_list = list(range(10,30)) + deadlock_detected = False + + engine = get_engine() + + # if there is no data to return then it executes the insert then returns nothing + if not return_columns: + + while attempts < 10: + try: + #begin keyword is needed for sqlalchemy 2.x + #this is because autocommit support was removed in 2.0 + with engine.begin() as connection: + connection.execute(stmnt) + break + except OperationalError as e: + # print(str(e).split("Process")[1].split(";")[0]) + if isinstance(e.orig, DeadlockDetected): + deadlock_detected = True + sleep_time = random.choice(sleep_time_list) + logger.debug(f"Deadlock detected on {table.__table__} table...trying again in {round(sleep_time)} seconds: transaction size: {len(data)}") + time.sleep(sleep_time) + + attempts += 1 + continue + + raise e + + except Exception as e: + #self.logger.info(e) + if len(data) == 1: + raise e + + time.sleep(3) + first_half = data[:len(data)//2] + second_half = data[len(data)//2:] + + bulk_insert_dicts(logger, first_half, table, natural_keys, return_columns, string_fields, on_conflict_update) + bulk_insert_dicts(logger, second_half,table, natural_keys, return_columns, string_fields, on_conflict_update) + + else: + logger.error("Unable to insert data in 10 attempts") + raise Exception("Unable to insert and return data in 10 attempts") + + if deadlock_detected is True: + logger.error("Made it through even though Deadlock was detected") + + return "success" + + + # othewise it gets the requested return columns and returns them as a list of dicts + while attempts < 10: + try: + with engine.begin() as connection: + return_data_tuples = connection.execute(stmnt) + break + except OperationalError as e: + if isinstance(e.orig, DeadlockDetected): + sleep_time = random.choice(sleep_time_list) + logger.debug(f"Deadlock detected on {table.__table__} table...trying again in {round(sleep_time)} seconds: transaction size: {len(data)}") + time.sleep(sleep_time) + + attempts += 1 + continue + + raise e + + except Exception as e: + if len(data) == 1: + raise e + + time.sleep(3) + first_half = data[:len(data)//2] + second_half = data[len(data)//2:] + + bulk_insert_dicts(logger, first_half, table, natural_keys, return_columns, string_fields, on_conflict_update) + bulk_insert_dicts(logger, second_half, table, natural_keys, return_columns, string_fields, on_conflict_update) + + else: + logger.error("Unable to insert and return data in 10 attempts") + raise Exception("Unable to insert and return data in 10 attempts") + + if deadlock_detected is True: + logger.error("Made it through even though Deadlock was detected") + + return_data = [dict(row) for row in return_data_tuples.mappings()] + + #no longer working in sqlalchemy 2.x + #for data_tuple in return_data_tuples: + # return_data.append(dict(data_tuple)) + + # using on confilict do nothing does not return the + # present values so this 
does gets the return values + if not on_conflict_update: + + conditions = [] + for column in natural_keys: + + column_values = [value[column] for value in data] + + column = getattr(table, column) + + conditions.append(column.in_(tuple(column_values))) + + with get_session() as session: + + result = ( + session.query(table).filter(*conditions).all() + ) + + for row in result: + + return_dict = {} + for field in return_columns: + + return_dict[field] = getattr(row, field) + + return_data.append(return_dict) + + + return return_data + + + +def get_issues_by_repo_id(repo_id): + + with get_session() as session: + + return session.query(Issue).filter(Issue.repo_id == repo_id).all() + +def get_pull_requests_by_repo_id(repo_id): + + with get_session() as session: + + return session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + +def get_pull_request_reviews_by_repo_id(repo_id): + + with get_session() as session: + + return session.query(PullRequestReview).filter(PullRequestReview.repo_id == repo_id).all() + +def get_contributor_aliases_by_email(email): + + with get_session() as session: + + return session.query(ContributorsAlias).filter_by(alias_email=email).all() + +def get_unresolved_commit_emails_by_name(name): + + with get_session() as session: + + return session.query(UnresolvedCommitEmail).filter_by(name=name).all() + +def get_contributors_by_full_name(full_name): + + with get_session() as session: + + return session.query(Contributor).filter_by(cntrb_full_name=full_name).all() + +def get_contributors_by_github_user_id(id): + + with get_session() as session: + + # Look into this, where it was used was doing .all() but this query should really only return one + return session.query(Contributor).filter_by(gh_user_id=id).all() + +def update_issue_closed_cntrbs_by_repo_id(repo_id): + + engine = get_engine() + + get_ranked_issues = s.text(f""" + WITH RankedIssues AS ( + SELECT repo_id, issue_id, cntrb_id, + ROW_NUMBER() OVER(PARTITION BY issue_id ORDER BY created_at DESC) AS rn + FROM issue_events + WHERE "action" = 'closed' + ) + + SELECT issue_id, cntrb_id from RankedIssues where rn=1 and repo_id={repo_id} and cntrb_id is not NULL + """) + + with engine.connect() as conn: + result = conn.execute(get_ranked_issues).fetchall() + + update_data = [] + for row in result: + update_data.append( + { + 'issue_id': row[0], + 'cntrb_id': row[1], + 'repo_id': repo_id + } + ) + + if update_data: + with engine.connect() as connection: + update_stmt = s.text(""" + UPDATE issues + SET cntrb_id = :cntrb_id + WHERE issue_id = :issue_id + AND repo_id = :repo_id + """) + connection.execute(update_stmt, update_data) + +def get_core_data_last_collected(repo_id): + + with get_session() as session: + try: + return session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo_id).one().core_data_last_collected + except s.orm.exc.NoResultFound: + return None + +def get_secondary_data_last_collected(repo_id): + + with get_session() as session: + try: + return session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo_id).one().secondary_data_last_collected + except s.orm.exc.NoResultFound: + return None + +def get_updated_prs(repo_id, since): + + with get_session() as session: + return session.query(PullRequest).filter(PullRequest.repo_id == repo_id, PullRequest.pr_updated_at >= since).order_by(PullRequest.pr_src_number).all() + +def get_updated_issues(repo_id, since): + + with get_session() as session: + return session.query(Issue).filter(Issue.repo_id == repo_id, 
Issue.updated_at >= since).order_by(Issue.gh_issue_number).all() + + + +def get_group_by_name(user_id, group_name): + + + with get_session() as session: + + try: + user_group = session.query(UserGroup).filter(UserGroup.user_id == user_id, UserGroup.name == group_name).one() + except s.orm.exc.NoResultFound: + return None + + return user_group + +def get_repo_group_by_name(name): + + + with get_session() as session: + + return session.query(RepoGroup).filter(RepoGroup.rg_name == name).first() + \ No newline at end of file diff --git a/augur/application/db/models/__init__.py b/augur/application/db/models/__init__.py index 74e232fed4..013f22ab42 100644 --- a/augur/application/db/models/__init__.py +++ b/augur/application/db/models/__init__.py @@ -63,6 +63,7 @@ PullRequestTeam, PullRequestRepo, PullRequestReviewMessageRef, + CommitMessage, RepoClone, ) diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index d938391087..9212bcc5e9 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -22,13 +22,17 @@ from sqlalchemy.orm import relationship from sqlalchemy.sql import text from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound +from time import sleep, mktime, gmtime, time, localtime import logging import re import json +import urllib.parse from augur.application.db.models.base import Base from augur.application.db.util import execute_session_query +from augur.application.db import get_session + DEFAULT_REPO_GROUP_ID = 1 metadata = Base.metadata @@ -587,12 +591,13 @@ def is_valid_repo_group_id(session, repo_group_id: int) -> bool: @staticmethod def get_by_name(session, rg_name): - query = session.query(RepoGroup).filter(RepoGroup.rg_name == rg_name) + with get_session() as session: - try: - result = execute_session_query(query, 'one') - except NoResultFound: - return None + try: + query = session.query(RepoGroup).filter(RepoGroup.rg_name == rg_name) + result = execute_session_query(query, 'one') + except NoResultFound: + return None return result @@ -865,6 +870,7 @@ class Repo(Base): data_collection_date = Column( TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP") ) + repo_src_id = Column(BigInteger) repo_group = relationship("RepoGroup", back_populates="repo") user_repo = relationship("UserRepo", back_populates="repo") @@ -878,7 +884,13 @@ class Repo(Base): @staticmethod def get_by_id(session, repo_id): - return session.query(Repo).filter(Repo.repo_id == repo_id).first() + try: + return session.query(Repo).filter(Repo.repo_id == repo_id).first() + except Exception as e: + session.rollback() + raise e + + @staticmethod def get_by_repo_git(session, repo_git): @@ -918,6 +930,20 @@ def is_valid_github_repo(gh_session, url: str) -> bool: continue data = result.json() + if result.status_code == 403: #GH Rate limiting + wait_until = int(result.headers.get("x-ratelimit-reset")) + # use time package to find how many seconds to wait + wait_in_seconds = int( + mktime(gmtime(wait_until)) - + mktime(gmtime(time())) + ) + wait_until_time = localtime(wait_until) + logger.error(f"rate limited fetching {url}") + logger.error(f"sleeping until {wait_until_time.tm_hour}:{wait_until_time.tm_min} ({wait_in_seconds} seconds)") + sleep(wait_in_seconds) + attempts+=1 + continue + # if there was an error return False if "message" in data.keys(): @@ -928,6 +954,8 @@ def is_valid_github_repo(gh_session, url: str) -> bool: return True, {"status": "Valid repo", "repo_type": data["owner"]["type"]} + 
return False, {"status": "Failed to validate repo after multiple attempts"}
+
    @staticmethod
    def is_valid_gitlab_repo(gl_session, url: str) -> bool:
        """Determine whether a GitLab repo URL is valid.
@@ -948,13 +976,18 @@ def is_valid_gitlab_repo(gl_session, url: str) -> bool:
            return False, {"status": "Invalid repo URL"}

        # Encode namespace and project name for the API request
-        project_identifier = f"{owner}%2F{repo}"
+        project_identifier = urllib.parse.quote(f"{owner}/{repo}", safe='')
        url = REPO_ENDPOINT.format(project_identifier)

        attempts = 0
        while attempts < 10:
            response = hit_api(gl_session.oauths, url, logger)

+            if (wait_in_seconds := response.headers.get("Retry-After")) is not None:
+                logger.info(f"rate limited fetching {url}, sleeping for {wait_in_seconds}")
+                print(f"rate limited fetching {url}, sleeping for {wait_in_seconds}")
+                sleep(int(wait_in_seconds))
+
            if response.status_code == 404:
                return False, {"status": "Invalid repo"}
@@ -962,6 +995,8 @@ def is_valid_gitlab_repo(gl_session, url: str) -> bool:
                return True, {"status": "Valid repo"}

            attempts += 1
+            logger.info(f"could not validate {url}, will attempt again in {attempts*3} seconds")
+            sleep(attempts*3)

        return False, {"status": "Failed to validate repo after multiple attempts"}
@@ -1000,7 +1035,7 @@ def parse_gitlab_repo_url(url: str) -> tuple:
            Tuple of owner and repo. Or a tuple of None and None if the url is invalid.
        """

-        result = re.search(r"https?:\/\/gitlab\.com\/([A-Za-z0-9 \- _]+)\/([A-Za-z0-9 \- _ \.]+)(.git)?\/?$", url)
+        result = re.search(r"https?:\/\/gitlab\.com\/([A-Za-z0-9\-_\/]+)\/([A-Za-z0-9\-_]+)(\.git)?\/?$", url)

        if not result:
            return None, None
@@ -1033,7 +1068,7 @@ def parse_github_org_url(url):
        return result.groups()[0]

    @staticmethod
-    def insert_gitlab_repo(session, url: str, repo_group_id: int, tool_source):
+    def insert_gitlab_repo(session, url: str, repo_group_id: int, tool_source, repo_src_id = None):
        """Add a repo to the repo table.

        Args:
@@ -1067,7 +1102,8 @@ def insert_gitlab_repo(session, url: str, repo_group_id: int, tool_source):
            "repo_type": None,
            "tool_source": tool_source,
            "tool_version": "1.0",
-            "data_source": "Git"
+            "data_source": "Git",
+            "repo_src_id": repo_src_id
        }

        repo_unique = ["repo_git"]
@@ -1080,7 +1116,7 @@ def insert_gitlab_repo(session, url: str, repo_group_id: int, tool_source):

        return result[0]["repo_id"]

    @staticmethod
-    def insert_github_repo(session, url: str, repo_group_id: int, tool_source, repo_type):
+    def insert_github_repo(session, url: str, repo_group_id: int, tool_source, repo_type, repo_src_id = None):
        """Add a repo to the repo table. 
Args: @@ -1115,7 +1151,8 @@ def insert_github_repo(session, url: str, repo_group_id: int, tool_source, repo_ "repo_type": repo_type, "tool_source": tool_source, "tool_version": "1.0", - "data_source": "Git" + "data_source": "Git", + "repo_src_id": repo_src_id } repo_unique = ["repo_git"] @@ -1320,6 +1357,36 @@ class Commit(Base): repo = relationship("Repo", back_populates="commits") message_ref = relationship("CommitCommentRef", back_populates="cmt") +class CommitMessage(Base): + __tablename__ = "commit_messages" + __table_args__ = ( UniqueConstraint("repo_id","cmt_hash", name="commit-message-insert-unique"), + { + "schema": "augur_data", + "comment": "This table holds commit messages", + } + ) + + cmt_msg_id = Column( + BigInteger, + primary_key=True, + server_default=text("nextval('augur_data.commits_cmt_id_seq'::regclass)"), + ) + + repo_id = Column( + ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE"), + nullable=False, + ) + + cmt_msg = Column(String, nullable=False) + + cmt_hash = Column(String(80), nullable=False) + + tool_source = Column(String) + tool_version = Column(String) + data_source = Column(String) + data_collection_date = Column( + TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP") + ) class Issue(Base): __tablename__ = "issues" @@ -1474,7 +1541,7 @@ class LstmAnomalyResult(Base): class Message(Base): __tablename__ = "message" __table_args__ = ( - UniqueConstraint("platform_msg_id", name="message-insert-unique"), + UniqueConstraint("platform_msg_id", "pltfrm_id", name="message-insert-unique"), Index("msg-cntrb-id-idx", "cntrb_id"), Index("platformgrouper", "msg_id", "pltfrm_id"), Index("messagegrouper", "msg_id", "rgls_id", unique=True), diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index 47f28b12f2..bb29c7571a 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -30,23 +30,29 @@ def retrieve_owner_repos(session, owner: str) -> List[str]: Returns List of valid repo urls or empty list if invalid org """ - from augur.tasks.github.util.github_paginator import GithubPaginator, retrieve_dict_from_endpoint + from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException OWNER_INFO_ENDPOINT = f"https://api.github.com/users/{owner}" ORG_REPOS_ENDPOINT = f"https://api.github.com/orgs/{owner}/repos?per_page=100" USER_REPOS_ENDPOINT = f"https://api.github.com/users/{owner}/repos?per_page=100" + github_data_access = GithubDataAccess(session.oauths, logger) + if not session.oauths.list_of_keys: return None, {"status": "No valid github api keys to retrieve data with"} # determine whether the owner is a user or an organization - data, _ = retrieve_dict_from_endpoint(logger, session.oauths, OWNER_INFO_ENDPOINT) - if not data: + try: + data = github_data_access.get_resource(OWNER_INFO_ENDPOINT) + except UrlNotFoundException as e: + logger.error("Owner not found on github") return None, {"status": "Invalid owner"} + except Exception as e: + logger.error(f"Failed to get owner data from github. 
Exception: {e}") + return None, {"status": "Failed to get owner data from github"} owner_type = data["type"] - if owner_type == "User": url = USER_REPOS_ENDPOINT elif owner_type == "Organization": @@ -54,15 +60,8 @@ def retrieve_owner_repos(session, owner: str) -> List[str]: else: return None, {"status": f"Invalid owner type: {owner_type}"} - # collect repo urls for the given owner - repos = [] - for page_data, _ in GithubPaginator(url, session.oauths, logger).iter_pages(): - - if page_data is None: - break - - repos.extend(page_data) + repos = list(github_data_access.paginate_resource(url)) repo_urls = [repo["html_url"] for repo in repos] @@ -329,6 +328,9 @@ def get_user(session, username: str): return user except NoResultFound: return None + except Exception as e: + session.rollback() + raise e @staticmethod def get_by_id(session, user_id: int): @@ -1073,7 +1075,13 @@ def __eq__(self, other): @staticmethod def get_by_id(session, client_id): - return session.query(ClientApplication).filter(ClientApplication.id == client_id).first() + + try: + return session.query(ClientApplication).filter(ClientApplication.id == client_id).first() + except Exception as e: + session.rollback() + raise e + class Subscription(Base): __tablename__ = "subscriptions" @@ -1224,7 +1232,7 @@ class CollectionStatus(Base): repo = relationship("Repo", back_populates="collection_status") @staticmethod - def insert(session, repo_id): + def insert(session, logger, repo_id): from augur.tasks.github.util.util import get_repo_weight_by_issue from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps @@ -1237,13 +1245,13 @@ def insert(session, repo_id): if "github" in repo_git: try: - pr_issue_count = get_repo_weight_by_issue(session.logger, repo_git) + pr_issue_count = get_repo_weight_by_issue(logger, repo_git) #session.logger.info(f"date weight: {calculate_date_weight_from_timestamps(repo.repo_added, None)}") github_weight = pr_issue_count - calculate_date_weight_from_timestamps(repo.repo_added, None) except Exception as e: pr_issue_count = None github_weight = None - session.logger.error( + logger.error( ''.join(traceback.format_exception(None, e, e.__traceback__))) else: try: @@ -1252,7 +1260,7 @@ def insert(session, repo_id): except Exception as e: pr_issue_count = None github_weight = None - session.logger.error( + logger.error( ''.join(traceback.format_exception(None, e, e.__traceback__))) @@ -1267,7 +1275,7 @@ def insert(session, repo_id): result = session.insert_data(record, CollectionStatus, collection_status_unique, on_conflict_update=False) - session.logger.info(f"Trying to insert repo \n issue and pr sum: {record['issue_pr_sum']}") + logger.info(f"Trying to insert repo \n issue and pr sum: {record['issue_pr_sum']}") if not result: return False diff --git a/augur/application/logs.py b/augur/application/logs.py index 11e1cb6ea5..1fb8709b82 100644 --- a/augur/application/logs.py +++ b/augur/application/logs.py @@ -36,12 +36,29 @@ def getFormatter(logLevel): return logging.Formatter(fmt=ERROR_FORMAT_STRING) # create a file handler and set the format and log level -def create_file_handler(file, formatter, level): - handler = FileHandler(filename=file, mode='a') - handler.setFormatter(fmt=formatter) - handler.setLevel(level) +# def create_file_handler(file, formatter, level): +# handler = FileHandler(filename=file, mode='a') +# handler.setFormatter(fmt=formatter) +# handler.setLevel(level) - return handler +# return handler + +def create_file_handler(file, formatter, level): + try: + # Ensure the 
directory exists + directory = os.path.dirname(file) + if not os.path.exists(directory): + os.makedirs(directory) + + # Create the file handler + handler = logging.FileHandler(filename=file, mode='a') + handler.setFormatter(formatter) + handler.setLevel(level) + + return handler + except Exception as e: + print(f"Failed to create file handler: {e}") + return None # function to create two file handlers and add them to a logger def initialize_file_handlers(logger, file, log_level): @@ -181,7 +198,12 @@ def __init__(self, logger_name, disable_log_files=False,reset_logfiles=False,bas if reset_logfiles is True: try: print("(augur) Reseting log files") - shutil.rmtree(base_log_dir) + base_log_dir_path = Path(base_log_dir) + for item in base_log_dir_path.iterdir(): + if item.is_dir(): + shutil.rmtree(item, ignore_errors=True) + else: + item.unlink(missing_ok=True) except FileNotFoundError as e: pass @@ -219,4 +241,3 @@ def __str__(self): def get_logger(self): return self.lg - diff --git a/augur/application/schema/alembic/versions/27_update_messages_unique.py b/augur/application/schema/alembic/versions/27_update_messages_unique.py new file mode 100644 index 0000000000..9c60349412 --- /dev/null +++ b/augur/application/schema/alembic/versions/27_update_messages_unique.py @@ -0,0 +1,33 @@ +""" Update messages unique + +Revision ID: 27 +Revises: 26 +Create Date: 2024-03-10 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy import text + +# revision identifiers, used by Alembic. +revision = '27' +down_revision = '26' +branch_labels = None +depends_on = None + + +schema_name = 'augur_data' +table_name = "message" +constraint_name = "message-insert-unique" + +def upgrade(): + + op.drop_constraint(constraint_name, table_name, schema=schema_name, type_='unique') + + op.create_unique_constraint(constraint_name, table_name, ['platform_msg_id', 'pltfrm_id'], schema=schema_name) + +def downgrade(): + + op.drop_constraint(constraint_name, table_name, schema=schema_name, type_='unique') + + op.create_unique_constraint(constraint_name, table_name, ['platform_msg_id'], schema=schema_name) diff --git a/augur/application/schema/alembic/versions/28_Performance_Indexes_a.py b/augur/application/schema/alembic/versions/28_Performance_Indexes_a.py new file mode 100644 index 0000000000..906cb2c121 --- /dev/null +++ b/augur/application/schema/alembic/versions/28_Performance_Indexes_a.py @@ -0,0 +1,93 @@ +""" Updating materialized views and associated indices + +Revision ID: 28 +Revises: 27 +Create Date: 2023-08-23 18:17:22.651191 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql +from sqlalchemy import text + +# revision identifiers, used by Alembic. 
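Because `create_file_handler` now returns `None` when the log directory cannot be created, callers such as `initialize_file_handlers` presumably have to skip registration in that case. A minimal sketch of that guard, assuming it lives in the same module as `create_file_handler` (the helper below is illustrative, not part of the patch):

```python
def add_file_handler_if_available(logger, file, formatter, level):
    # create_file_handler returns None if the log directory could not be created
    handler = create_file_handler(file, formatter, level)
    if handler is None:
        print(f"(augur) Skipping file logging for {file}")
        return
    logger.addHandler(handler)
```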
+revision = '28' +down_revision = '27' +branch_labels = None +depends_on = None + +def upgrade(): + + mview_keys_28() + +def mview_keys_28(upgrade=True): + + if upgrade: + conn = op.get_bind() + conn.execute(text(""" + + DROP INDEX if exists "pr_ID_prs_table"; + DROP INDEX if exists "pr_id_pr_files"; + DROP INDEX if exists "pr_id_pr_reviews"; + DROP materialized view if exists augur_data.explorer_repo_languages; + + + + CREATE INDEX "pr_ID_prs_table" ON "augur_data"."pull_requests" USING btree ( + "pull_request_id" "pg_catalog"."int8_ops" ASC NULLS LAST + ); + + CREATE INDEX "pr_id_pr_files" ON "augur_data"."pull_request_files" USING btree ( + "pull_request_id" "pg_catalog"."int8_ops" ASC NULLS LAST + ); + + CREATE INDEX "pr_id_pr_reviews" ON "augur_data"."pull_request_reviews" USING btree ( + "pull_request_id" "pg_catalog"."int8_ops" ASC NULLS LAST + );""")) + + conn = op.get_bind() + conn.execute(text(""" + CREATE MATERIALIZED VIEW augur_data.explorer_repo_languages as + SELECT e.repo_id, + repo.repo_git, + repo.repo_name, + e.programming_language, + e.code_lines, + e.files + FROM augur_data.repo, + ( SELECT d.repo_id, + d.programming_language, + sum(d.code_lines) AS code_lines, + (count(*))::integer AS files + FROM ( SELECT repo_labor.repo_id, + repo_labor.programming_language, + repo_labor.code_lines + FROM augur_data.repo_labor, + ( SELECT repo_labor_1.repo_id, + max(repo_labor_1.data_collection_date) AS last_collected + FROM augur_data.repo_labor repo_labor_1 + GROUP BY repo_labor_1.repo_id) recent + WHERE ((repo_labor.repo_id = recent.repo_id) AND (repo_labor.data_collection_date > (recent.last_collected - ((5)::double precision * '00:01:00'::interval))))) d + GROUP BY d.repo_id, d.programming_language) e + WHERE (repo.repo_id = e.repo_id) + ORDER BY e.repo_id;""")) + + conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text(""" + CREATE UNIQUE INDEX ON augur_data.explorer_repo_languages(repo_id, programming_language); """)) + conn.execute(text("""COMMIT;""")) +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + conn = op.get_bind() + + #Make unique initially deferred + conn.execute(text(f""" + DROP INDEX if exists "pr_ID_prs_table"; + DROP INDEX if exists "pr_id_pr_files"; + DROP INDEX if exists "pr_id_pr_reviews"; + DROP materialized view if exists augur_data.explorer_repo_languages; + """)) + + # ### end Alembic commands ### diff --git a/augur/application/schema/alembic/versions/29_add_commit_message_table.py b/augur/application/schema/alembic/versions/29_add_commit_message_table.py new file mode 100644 index 0000000000..11f3fef554 --- /dev/null +++ b/augur/application/schema/alembic/versions/29_add_commit_message_table.py @@ -0,0 +1,42 @@ +"""Add commit message table + +Revision ID: 29 +Revises: 28 +Create Date: 2024-07-25 12:02:57.185867 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = '29' +down_revision = '28' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table('commit_messages', + sa.Column('cmt_msg_id', sa.BigInteger(), server_default=sa.text("nextval('augur_data.commits_cmt_id_seq'::regclass)"), nullable=False), + sa.Column('repo_id', sa.BigInteger(), nullable=False), + sa.Column('cmt_msg', sa.String(), nullable=False), + sa.Column('cmt_hash', sa.String(length=80), nullable=False), + sa.Column('tool_source', sa.String(), nullable=True), + sa.Column('tool_version', sa.String(), nullable=True), + sa.Column('data_source', sa.String(), nullable=True), + sa.Column('data_collection_date', postgresql.TIMESTAMP(precision=0), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=True), + sa.ForeignKeyConstraint(['repo_id'], ['augur_data.repo.repo_id'], onupdate='CASCADE', ondelete='RESTRICT'), + sa.PrimaryKeyConstraint('cmt_msg_id'), + sa.UniqueConstraint('repo_id', 'cmt_hash', name='commit-message-insert-unique'), + schema='augur_data', + comment='This table holds commit messages' + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_table('commit_messages', schema='augur_data') + # ### end Alembic commands ### diff --git a/augur/application/schema/alembic/versions/30_add_repo_src_id.py b/augur/application/schema/alembic/versions/30_add_repo_src_id.py new file mode 100644 index 0000000000..013890697e --- /dev/null +++ b/augur/application/schema/alembic/versions/30_add_repo_src_id.py @@ -0,0 +1,25 @@ +"""Add repo src id + +Revision ID: 30 +Revises: 29 +Create Date: 2024-08-30 + +""" +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision = '30' +down_revision = '29' +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column('repo', sa.Column('repo_src_id', sa.BigInteger(), nullable=True), schema='augur_data') + op.create_unique_constraint('repo_src_id_unique', 'repo', ['repo_src_id'], schema='augur_data') + + +def downgrade(): + op.drop_constraint('repo_src_id_unique', 'repo', schema='augur_data', type_='unique') + op.drop_column('repo', 'repo_src_id', schema='augur_data') diff --git a/augur/application/schema/alembic/versions/31_update_pr_events_unique.py b/augur/application/schema/alembic/versions/31_update_pr_events_unique.py new file mode 100644 index 0000000000..b55b60a09a --- /dev/null +++ b/augur/application/schema/alembic/versions/31_update_pr_events_unique.py @@ -0,0 +1,83 @@ +"""Update pr events unique + +Revision ID: 31 +Revises: 30 +Create Date: 2025-03-08 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy import text +from augur.application.db import create_database_engine, get_database_string + + +# revision identifiers, used by Alembic. 
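To illustrate how the new `commit_messages` table and its `(repo_id, cmt_hash)` unique constraint are intended to be used, a collection task could upsert rows through the existing `bulk_insert_dicts` helper. A sketch under those assumptions (the task function itself is hypothetical; the record fields follow the `CommitMessage` model above):

```python
from augur.application.db.lib import bulk_insert_dicts
from augur.application.db.models import CommitMessage

def store_commit_messages(logger, repo_id, commits):
    """commits is assumed to be an iterable of (hash, message) pairs taken from the local clone."""
    records = [
        {
            "repo_id": repo_id,
            "cmt_hash": cmt_hash,
            "cmt_msg": cmt_msg,
            "tool_source": "Facade",
            "tool_version": "0.1.0",
            "data_source": "Git",
        }
        for cmt_hash, cmt_msg in commits
    ]
    # (repo_id, cmt_hash) is the natural key backed by the new unique constraint,
    # so re-running collection updates existing rows instead of duplicating messages.
    bulk_insert_dicts(logger, records, CommitMessage, ["repo_id", "cmt_hash"])
```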
+revision = '31' +down_revision = '30' +branch_labels = None +depends_on = None + + + # conn = op.get_bind() + # conn.execute(text(""" + # UPDATE pull_request_events + # SET issue_event_src_id = substring(node_url FROM '.*/([0-9]+)$')::BIGINT; + # """)) + + +def upgrade(): + + connection_string = get_database_string() + engine = create_database_engine(connection_string) + + with engine.connect() as conn: + + result = conn.execute(text("SELECT COUNT(*) FROM pull_request_events WHERE issue_event_src_id=pr_platform_event_id")) + total_rows = result.scalar() + if total_rows != 0: + print(f"Rows needing updated: {total_rows}") + print(f"0.0% complete") + total_updated = 0 + + while True: + result = conn.execute(text(""" + WITH cte AS ( + SELECT pr_event_id + FROM pull_request_events + WHERE issue_event_src_id=pr_platform_event_id + LIMIT 250000 + ) + UPDATE pull_request_events + SET issue_event_src_id = substring(node_url FROM '.*/([0-9]+)$')::BIGINT + FROM cte + WHERE pull_request_events.pr_event_id = cte.pr_event_id + RETURNING 1; + """)) + + conn.commit() + + rows_updated = result.rowcount + total_updated += rows_updated + + if rows_updated == 0: + print(f"Update complete") + break + + percentage_updated = (total_updated / total_rows) * 100 + + print(f"{percentage_updated:.1f}% complete ({total_rows-total_updated} rows left)") + + print("Creating (repo_id, issue_event_src_id) index") + op.create_unique_constraint('pr_events_repo_id_event_src_id_unique', 'pull_request_events', ['repo_id', 'issue_event_src_id'], schema='augur_data') + + +def downgrade(): + op.drop_constraint('pr_events_repo_id_event_src_id_unique', 'pull_request_events', schema='augur_data', type_='unique') + + print("Please run in background. This downgrade will take a very *very* long time") + conn = op.get_bind() + conn.execute(text(""" + UPDATE pull_request_events + SET issue_event_src_id = pr_platform_event_id + WHERE issue_event_src_id <> pr_platform_event_id; + """)) \ No newline at end of file diff --git a/augur/static/css/first_time.css b/augur/static/css/first_time.css index 12f8ae9f54..f2d4602399 100644 --- a/augur/static/css/first_time.css +++ b/augur/static/css/first_time.css @@ -1,50 +1,102 @@ +:root { + --color-bg: #1A233A; + --color-bg-light: #272E48; + --color-bg-contrast: #646683; + --color-fg: white; + --color-fg-dark: #b0bdd6; + --color-fg-contrast: black; + --color-accent: #6f42c1; + --color-accent-dark: #6134b3; + --color-notice: #00ddff; + --color-notice-contrast: #006979; +} + body{ margin-top:20px; - color: #bcd0f7; - background: #1A233A; + background-color: var(--color-bg); + color: var(--color-fg); } + h1 { font-size: 2rem; } + .sidebar .sidebar-top { margin: 0 0 1rem 0; padding-bottom: 1rem; text-align: center; } + .sidebar .sidebar-top .brand-logo { margin: 0 0 1rem 0; } + .sidebar .sidebar-top .brand-logo img { height: 90px; -webkit-border-radius: 100px; -moz-border-radius: 100px; border-radius: 100px; } + .sidebar .about { margin: 1rem 0 0 0; font-size: 0.8rem; text-align: center; } + +.subtitle { + color: var(--color-fg-dark); + margin-bottom: .5rem; + margin-left: 15px; +} + +.no-margin-bottom { + margin-bottom: 0; +} + .card { - background: #272E48; + background: var(--color-bg-light); -webkit-border-radius: 5px; -moz-border-radius: 5px; border-radius: 5px; border: 0; margin-bottom: 1rem; } + .form-control { border: 1px solid #596280; -webkit-border-radius: 2px; -moz-border-radius: 2px; border-radius: 2px; font-size: .825rem; - background: #1A233A; - color: #bcd0f7; + background: 
var(--color-bg-light); + color: var(--color-fg); +} + +.input-textbox { + color: var(--color-fg); + background-color: var(--color-bg); + border-color: var(--color-accent-dark); } + +.input-textbox::placeholder { + color: var(--color-fg-dark); +} + +.input-textbox:focus { + color: var(--color-fg); + background-color: var(--color-bg); + border-color: var(--color-accent-dark); +} + +.input-textbox:focus::placeholder { + color: var(--color-fg-dark); +} + .modal-content { - color: black; + color: var(--color-fg-contrast); } + .editor-container { height: 300px !important; } diff --git a/augur/tasks/data_analysis/__init__.py b/augur/tasks/data_analysis/__init__.py index b600bcac77..0db9a97ec4 100644 --- a/augur/tasks/data_analysis/__init__.py +++ b/augur/tasks/data_analysis/__init__.py @@ -1,7 +1,7 @@ from celery import chain import logging -def machine_learning_phase(repo_git): +def machine_learning_phase(repo_git, full_collection): from augur.tasks.data_analysis.clustering_worker.tasks import clustering_task from augur.tasks.data_analysis.discourse_analysis.tasks import discourse_analysis_task from augur.tasks.data_analysis.insight_worker.tasks import insight_task @@ -15,7 +15,7 @@ def machine_learning_phase(repo_git): ml_tasks.append(discourse_analysis_task.si(repo_git)) ml_tasks.append(insight_task.si(repo_git)) ml_tasks.append(message_insight_task.si(repo_git)) - ml_tasks.append(pull_request_analysis_task.si(repo_git)) + #ml_tasks.append(pull_request_analysis_task.si(repo_git)) logger.info(f"Machine learning sequence: {ml_tasks}") return chain(*ml_tasks) \ No newline at end of file diff --git a/augur/tasks/data_analysis/clustering_worker/setup.py b/augur/tasks/data_analysis/clustering_worker/setup.py deleted file mode 100644 index 78fb0b4b50..0000000000 --- a/augur/tasks/data_analysis/clustering_worker/setup.py +++ /dev/null @@ -1,45 +0,0 @@ -import io -import os -import re - -from setuptools import find_packages -from setuptools import setup - -def read(filename): - filename = os.path.join(os.path.dirname(__file__), filename) - text_type = type(u"") - with io.open(filename, mode="r", encoding='utf-8') as fd: - return re.sub(text_type(r':[a-z]+:`~?(.*?)`'), text_type(r'``\1``'), fd.read()) - -setup( - name="clustering_worker", - version="0.0.2", - url="https://github.com/chaoss/augur", - license='MIT', - author="Sarit Adhikari", - author_email="sarit.adhikari@gmail.com", - description="worker to cluster repository based on messages on issues and pull requests ", - packages=find_packages(), - install_requires=[ - 'Flask==2.0.2', - 'Flask-Cors==3.0.10', - 'Flask-Login==0.5.0', - 'Flask-WTF==1.0.0', - 'requests==2.28.0', - 'psycopg2-binary==2.9.3', - #'sklearn==0.0.0', - 'scikit-learn==1.1.3', - 'numpy==1.26.0', - 'nltk==3.6.6', - 'seaborn==0.11.1', - 'pandas==1.5.3', - 'matplotlib>=3.5.1' - ], - classifiers=[ - 'Development Status :: 2 - Pre-Alpha', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.7', - ] -) diff --git a/augur/tasks/data_analysis/clustering_worker/tasks.py b/augur/tasks/data_analysis/clustering_worker/tasks.py index e59951ab0e..d548ecf108 100644 --- a/augur/tasks/data_analysis/clustering_worker/tasks.py +++ b/augur/tasks/data_analysis/clustering_worker/tasks.py @@ -20,10 +20,8 @@ from collections import Counter from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.session import DatabaseSession -from augur.application.db.lib import 
get_value -from augur.application.db.models import Repo, RepoClusterMessage, RepoTopic, TopicWord -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_value, get_session, get_repo_by_repo_git +from augur.application.db.models import RepoClusterMessage, RepoTopic, TopicWord from augur.tasks.init.celery_app import AugurMlRepoCollectionTask @@ -37,10 +35,9 @@ def clustering_task(self, repo_git): logger = logging.getLogger(clustering_model.__name__) engine = self.app.engine - with DatabaseSession(logger, engine) as session: - clustering_model(repo_git, logger, engine, session) + clustering_model(repo_git, logger, engine) -def clustering_model(repo_git: str,logger,engine, session) -> None: +def clustering_model(repo_git: str,logger,engine) -> None: logger.info(f"Starting clustering analysis for {repo_git}") @@ -56,8 +53,7 @@ def clustering_model(repo_git: str,logger,engine, session) -> None: tool_version = '0.2.0' data_source = 'Augur Collected Messages' - query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id + repo_id = get_repo_by_repo_git(repo_git).repo_id num_clusters = get_value("Clustering_Task", 'num_clusters') max_df = get_value("Clustering_Task", 'max_df') @@ -123,7 +119,7 @@ def clustering_model(repo_git: str,logger,engine, session) -> None: # check if dumped pickle file exists, if exists no need to train the model if not os.path.exists(MODEL_FILE_NAME): logger.info("clustering model not trained. Training the model.........") - train_model(logger, engine, session, max_df, min_df, max_features, ngram_range, num_clusters, num_topics, num_words_per_topic, tool_source, tool_version, data_source) + train_model(logger, engine, max_df, min_df, max_features, ngram_range, num_clusters, num_topics, num_words_per_topic, tool_source, tool_version, data_source) else: model_stats = os.stat(MODEL_FILE_NAME) model_age = (time.time() - model_stats.st_mtime) @@ -131,7 +127,7 @@ def clustering_model(repo_git: str,logger,engine, session) -> None: logger.debug(f'model age is: {model_age}') if model_age > 2000000: logger.info("clustering model to old. 
Retraining the model.........") - train_model(logger, engine, session, max_df, min_df, max_features, ngram_range, num_clusters, num_topics, num_words_per_topic, tool_source, tool_version, data_source) + train_model(logger, engine, max_df, min_df, max_features, ngram_range, num_clusters, num_topics, num_words_per_topic, tool_source, tool_version, data_source) else: logger.info("using pre-trained clustering model....") @@ -162,18 +158,20 @@ def clustering_model(repo_git: str,logger,engine, session) -> None: prediction = kmeans_model.predict(feature_matrix_cur_repo) logger.info("prediction: " + str(prediction[0])) - # inserting data - record = { - 'repo_id': int(repo_id), - 'cluster_content': int(prediction[0]), - 'cluster_mechanism': -1, - 'tool_source': tool_source, - 'tool_version': tool_version, - 'data_source': data_source - } - repo_cluster_messages_obj = RepoClusterMessage(**record) - session.add(repo_cluster_messages_obj) - session.commit() + with get_session() as session: + + # inserting data + record = { + 'repo_id': int(repo_id), + 'cluster_content': int(prediction[0]), + 'cluster_mechanism': -1, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + repo_cluster_messages_obj = RepoClusterMessage(**record) + session.add(repo_cluster_messages_obj) + session.commit() # result = db.execute(repo_cluster_messages_table.insert().values(record)) logging.info( @@ -197,22 +195,24 @@ def clustering_model(repo_git: str,logger,engine, session) -> None: logger.debug('prediction vocab') prediction = lda_model.transform(count_matrix_cur_repo) - logger.debug('for loop for vocab') - for i, prob_vector in enumerate(prediction): - # repo_id = msg_df.loc[i]['repo_id'] - for i, prob in enumerate(prob_vector): - record = { - 'repo_id': int(repo_id), - 'topic_id': i + 1, - 'topic_prob': prob, - 'tool_source': tool_source, - 'tool_version': tool_version, - 'data_source': data_source - } - - repo_topic_object = RepoTopic(**record) - session.add(repo_topic_object) - session.commit() + with get_session() as session: + + logger.debug('for loop for vocab') + for i, prob_vector in enumerate(prediction): + # repo_id = msg_df.loc[i]['repo_id'] + for i, prob in enumerate(prob_vector): + record = { + 'repo_id': int(repo_id), + 'topic_id': i + 1, + 'topic_prob': prob, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + repo_topic_object = RepoTopic(**record) + session.add(repo_topic_object) + session.commit() # result = db.execute(repo_topic_table.insert().values(record)) except Exception as e: @@ -260,7 +260,7 @@ def preprocess_and_tokenize(text): stems = [stemmer.stem(t) for t in tokens] return stems -def train_model(logger, engine, session, max_df, min_df, max_features, ngram_range, num_clusters, num_topics, num_words_per_topic, tool_source, tool_version, data_source): +def train_model(logger, engine, max_df, min_df, max_features, ngram_range, num_clusters, num_topics, num_words_per_topic, tool_source, tool_version, data_source): def visualize_labels_PCA(features, labels, annotations, num_components, title): labels_color_map = {-1: "red"} for label in labels: @@ -372,32 +372,35 @@ def visualize_labels_PCA(features, labels, annotations, num_components, title): # twid = self.db.execute(key_sequence_words_sql) # logger.info("twid variable is: {}".format(twid)) # insert topic list into database - topic_id = 1 - for topic in topic_list: - # twid = self.get_max_id('topic_words', 'topic_words_id') + 1 - # logger.info("twid variable 
is: {}".format(twid)) - for i in topic.argsort()[:-num_words_per_topic - 1:-1]: - # twid+=1 - # logger.info("in loop incremented twid variable is: {}".format(twid)) + + with get_session() as session: + + topic_id = 1 + for topic in topic_list: + # twid = self.get_max_id('topic_words', 'topic_words_id') + 1 # logger.info("twid variable is: {}".format(twid)) - record = { - # 'topic_words_id': twid, - # 'word_prob': word_prob[i], - 'topic_id': int(topic_id), - 'word': feature_names[i], - 'tool_source': tool_source, - 'tool_version': tool_version, - 'data_source': data_source - } - - topic_word_obj = TopicWord(**record) - session.add(topic_word_obj) - session.commit() - - # result = db.execute(topic_words_table.insert().values(record)) - logger.info( - "Primary key inserted into the topic_words table: {}".format(topic_word_obj.topic_words_id)) - topic_id += 1 + for i in topic.argsort()[:-num_words_per_topic - 1:-1]: + # twid+=1 + # logger.info("in loop incremented twid variable is: {}".format(twid)) + # logger.info("twid variable is: {}".format(twid)) + record = { + # 'topic_words_id': twid, + # 'word_prob': word_prob[i], + 'topic_id': int(topic_id), + 'word': feature_names[i], + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + topic_word_obj = TopicWord(**record) + session.add(topic_word_obj) + session.commit() + + # result = db.execute(topic_words_table.insert().values(record)) + logger.info( + "Primary key inserted into the topic_words table: {}".format(topic_word_obj.topic_words_id)) + topic_id += 1 # insert topic list into database diff --git a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py index 15660e763b..896ccd61d1 100644 --- a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py +++ b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py @@ -4,9 +4,10 @@ from datetime import datetime from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.github.util.github_task_session import GithubTaskManifest -from augur.tasks.github.util.github_paginator import GithubPaginator +from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException from augur.application.db.models import ContributorRepo +from augur.application.db.lib import bulk_insert_dicts +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth ### This worker scans all the platform users in Augur, and pulls their platform activity ### logs. 
Those are then used to analyze what repos each is working in (which will include repos not @@ -25,6 +26,8 @@ def contributor_breadth_model(self) -> None: tool_version = '0.0.1' data_source = 'GitHub API' + key_auth = GithubRandomKeyAuth(logger) + # This version of the query pulls contributors who have not had any data collected yet # To the top of the list cntrb_login_query = s.sql.text(""" @@ -60,7 +63,6 @@ def contributor_breadth_model(self) -> None: current_cntrb_logins = [dict(row) for row in result.mappings()] - cntrb_newest_events_query = s.sql.text(""" SELECT c.gh_login, MAX(cr.created_at) as newest_event_date FROM contributor_repo AS cr @@ -81,43 +83,46 @@ def contributor_breadth_model(self) -> None: cntrb_newest_events_map[gh_login] = newest_event_date + github_data_access = GithubDataAccess(key_auth, logger) - with GithubTaskManifest(logger) as manifest: + index = 1 + total = len(current_cntrb_logins) + for cntrb in current_cntrb_logins: - index = 1 - total = len(current_cntrb_logins) - for cntrb in current_cntrb_logins: + print(f"Processing cntrb {index} of {total}") + index += 1 - print(f"Processing cntrb {index} of {total}") - index += 1 + repo_cntrb_url = f"https://api.github.com/users/{cntrb['gh_login']}/events" - repo_cntrb_url = f"https://api.github.com/users/{cntrb['gh_login']}/events" + newest_event_in_db = datetime(1970, 1, 1) + if cntrb["gh_login"] in cntrb_newest_events_map: + newest_event_in_db = cntrb_newest_events_map[cntrb["gh_login"]] + - newest_event_in_db = datetime(1970, 1, 1) - if cntrb["gh_login"] in cntrb_newest_events_map: - newest_event_in_db = cntrb_newest_events_map[cntrb["gh_login"]] - + cntrb_events = [] + try: + for event in github_data_access.paginate_resource(repo_cntrb_url): - cntrb_events = [] - for page_data, page in GithubPaginator(repo_cntrb_url, manifest.key_auth, logger).iter_pages(): + cntrb_events.append(event) - if page_data: - cntrb_events += page_data - - oldest_event_on_page = datetime.strptime(page_data[-1]["created_at"], "%Y-%m-%dT%H:%M:%SZ") - if oldest_event_on_page < newest_event_in_db: - print("Found cntrb events we already have...skipping the rest") - break + event_age = datetime.strptime(event["created_at"], "%Y-%m-%dT%H:%M:%SZ") + if event_age < newest_event_in_db: + logger.info("Found cntrb events we already have...skipping the rest") + break if len(cntrb_events) == 0: logger.info("There are no cntrb events, or new events for this user.\n") continue - events = process_contributor_events(cntrb, cntrb_events, logger, tool_source, tool_version, data_source) + except UrlNotFoundException as e: + logger.warning(e) + continue + + events = process_contributor_events(cntrb, cntrb_events, logger, tool_source, tool_version, data_source) - logger.info(f"Inserting {len(events)} events") - natural_keys = ["event_id", "tool_version"] - manifest.augur_db.insert_data(events, ContributorRepo, natural_keys) + logger.info(f"Inserting {len(events)} events") + natural_keys = ["event_id", "tool_version"] + bulk_insert_dicts(logger, events, ContributorRepo, natural_keys) def process_contributor_events(cntrb, cntrb_events, logger, tool_source, tool_version, data_source): diff --git a/augur/tasks/data_analysis/contributor_breadth_worker/setup.py b/augur/tasks/data_analysis/contributor_breadth_worker/setup.py deleted file mode 100644 index 86052e164c..0000000000 --- a/augur/tasks/data_analysis/contributor_breadth_worker/setup.py +++ /dev/null @@ -1,44 +0,0 @@ -#SPDX-License-Identifier: MIT -import io -import os -import re - -from setuptools import 
find_packages -from setuptools import setup - -def read(filename): - filename = os.path.join(os.path.dirname(__file__), filename) - text_type = type(u"") - with io.open(filename, mode="r", encoding='utf-8') as fd: - return re.sub(text_type(r':[a-z]+:`~?(.*?)`'), text_type(r'``\1``'), fd.read()) - -setup( - name="contributor_breadth_worker", - version="0.0.0", - url="https://github.com/chaoss/augur", - license='MIT', - author="AugurLabs", - author_email="gabe@gabehe.im", - description="Augur worker that collects the repos people have contirbuted to", - packages=find_packages(), - install_requires=[ - 'Flask==2.0.2', - 'Flask-Cors==3.0.10', - 'Flask-Login==0.5.0', - 'Flask-WTF==1.0.0', - 'requests==2.28.0', - 'psycopg2-binary==2.9.3' - ], - entry_points={ - 'console_scripts': [ - 'contributor_breadth_worker_start=workers.contributor_breadth_worker.runtime:main', - ], - }, - classifiers=[ - 'Development Status :: 2 - Pre-Alpha', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.7', - ] -) diff --git a/augur/tasks/data_analysis/discourse_analysis/setup.py b/augur/tasks/data_analysis/discourse_analysis/setup.py deleted file mode 100644 index 37d6557ec5..0000000000 --- a/augur/tasks/data_analysis/discourse_analysis/setup.py +++ /dev/null @@ -1,52 +0,0 @@ -import io -import os -import re - -from setuptools import find_packages -from setuptools import setup - -def read(filename): - filename = os.path.join(os.path.dirname(__file__), filename) - text_type = type(u"") - with io.open(filename, mode="r", encoding='utf-8') as fd: - return re.sub(text_type(r':[a-z]+:`~?(.*?)`'), text_type(r'``\1``'), fd.read()) - -setup( - name="discourse_analysis_worker", - version="0.1.0", - url="https://github.com/chaoss/augur", - license='MIT', - author="Augur Team", - author_email="s@goggins.com", - description="Worker to classify messages into discourse acts", - packages=find_packages(), - install_requires=[ - 'Flask==2.0.2', - 'Flask-Cors==3.0.10', - 'Flask-Login==0.5.0', - 'Flask-WTF==1.0.0', - 'requests==2.28.0', - 'psycopg2-binary==2.9.3', - 'click==8.0.3', - 'scipy>=1.10.0', - 'nltk==3.6.6', - 'pandas==1.5.3', - 'scikit-learn==1.1.3', - 'textblob==0.15.3', - 'python-crfsuite>=0.9.8', - 'sklearn-crfsuite>=0.3.6', - 'tabulate==0.8.9' - ], # python-crfsuite-0.9.8 sklearn-crfsuite-0.3.6 tabulate-0.8.9 - entry_points={ - 'console_scripts': [ - 'discourse_analysis_worker_start=workers.discourse_analysis_worker.runtime:main', - ], - }, - classifiers=[ - 'Development Status :: 2 - Pre-Alpha', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.7', - ] -) diff --git a/augur/tasks/data_analysis/discourse_analysis/tasks.py b/augur/tasks/data_analysis/discourse_analysis/tasks.py index 450ec15a29..e78e030e66 100644 --- a/augur/tasks/data_analysis/discourse_analysis/tasks.py +++ b/augur/tasks/data_analysis/discourse_analysis/tasks.py @@ -8,7 +8,7 @@ from collections import Counter from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.session import DatabaseSession +from augur.application.db.lib import get_session, get_repo_by_repo_git from augur.application.db.models import Repo, DiscourseInsight from augur.application.db.util import execute_session_query from augur.tasks.init.celery_app import AugurMlRepoCollectionTask @@ -47,10 +47,7 @@ def discourse_analysis_model(repo_git: 
str,logger,engine) -> None: tool_version = '0.1.0' data_source = 'Analysis of Issue/PR Messages' - with DatabaseSession(logger, engine) as session: - - query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id + repo_id = get_repo_by_repo_git(repo_git).repo_id get_messages_for_repo_sql = s.sql.text(""" (SELECT r.repo_group_id, r.repo_id, r.repo_git, r.repo_name, i.issue_id thread_id,m.msg_text,i.issue_title thread_title,m.msg_id @@ -96,7 +93,7 @@ def discourse_analysis_model(repo_git: str,logger,engine) -> None: logger.debug(f"y_pred_git_flat len: {len(y_pred_git_flat)}") msg_df_cur_repo['discourse_act'] = y_pred_git_flat - with DatabaseSession(logger, engine) as session: + with get_session() as session: for index, row in msg_df_cur_repo.iterrows(): record = { 'msg_id': row['msg_id'], diff --git a/augur/tasks/data_analysis/insight_worker/setup.py b/augur/tasks/data_analysis/insight_worker/setup.py deleted file mode 100644 index 1ee6e8a4bd..0000000000 --- a/augur/tasks/data_analysis/insight_worker/setup.py +++ /dev/null @@ -1,48 +0,0 @@ -#SPDX-License-Identifier: MIT -import io -import os -import re - -from setuptools import find_packages -from setuptools import setup - -def read(filename): - filename = os.path.join(os.path.dirname(__file__), filename) - text_type = type(u"") - with io.open(filename, mode="r", encoding='utf-8') as fd: - return re.sub(text_type(r':[a-z]+:`~?(.*?)`'), text_type(r'``\1``'), fd.read()) - -setup( - name="insight_worker", - version="1.0.0", - url="https://github.com/chaoss/augur", - license='MIT', - author="Augurlabs", - author_email="s@goggins.com", - description="Augur Worker that discovers and stores data anomalies", - packages=find_packages(exclude=('tests',)), - install_requires=[ - 'Flask==2.0.2', - 'Flask-Cors==3.0.10', - 'Flask-Login==0.5.0', - 'Flask-WTF==1.0.0', - 'requests==2.28.0', - 'psycopg2-binary==2.9.3', - 'click==8.0.3', - 'scipy>=1.10.0', - 'sklearn==0.0', - 'numpy==1.26.0', - ], - entry_points={ - 'console_scripts': [ - 'insight_worker_start=workers.insight_worker.runtime:main', - ], - }, - classifiers=[ - 'Development Status :: 2 - Pre-Alpha', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.7', - ] -) diff --git a/augur/tasks/data_analysis/insight_worker/tasks.py b/augur/tasks/data_analysis/insight_worker/tasks.py index 5ec7d942ed..97a6580d6f 100644 --- a/augur/tasks/data_analysis/insight_worker/tasks.py +++ b/augur/tasks/data_analysis/insight_worker/tasks.py @@ -10,10 +10,8 @@ import warnings from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.session import DatabaseSession -from augur.application.db.lib import get_value -from augur.application.db.models import Repo, ChaossMetricStatus, RepoInsight, RepoInsightsRecord -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_value, get_repo_by_repo_git, get_session +from augur.application.db.models import ChaossMetricStatus, RepoInsight, RepoInsightsRecord from augur.tasks.init.celery_app import AugurMlRepoCollectionTask warnings.filterwarnings('ignore') @@ -25,11 +23,10 @@ def insight_task(self, repo_git): logger = logging.getLogger(insight_task.__name__) engine = self.app.engine - with DatabaseSession(logger, engine) as session: - insight_model(repo_git, logger, engine, session) + insight_model(repo_git, logger, engine) -def 
insight_model(repo_git: str,logger,engine,session) -> None: +def insight_model(repo_git: str,logger,engine) -> None: refresh = True send_insights = True @@ -40,8 +37,8 @@ def insight_model(repo_git: str,logger,engine,session) -> None: metrics = {"issues-new": "issues", "code-changes": "commit_count", "code-changes-lines": "added", "reviews": "pull_requests", "contributors-new": "new_contributors"} - query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id anomaly_days = get_value('Insight_Task', 'anomaly_days') training_days = get_value('Insight_Task', 'training_days') @@ -110,7 +107,7 @@ def insight_model(repo_git: str,logger,engine,session) -> None: """) with engine.connect() as conn: - result = conn.execute(delete_record_SQL, repo_id=repo_id, min_date=min_date) + result = conn.execute(delete_record_SQL, parameters=dict(repo_id=repo_id, min_date=min_date)) logger.info("Deleting out of date data points ...\n") delete_points_SQL = s.sql.text(""" @@ -132,7 +129,7 @@ def insight_model(repo_git: str,logger,engine,session) -> None: """) with engine.connect() as conn: - result = conn.execute(delete_points_SQL, repo_id=repo_id, min_date=min_date) + result = conn.execute(delete_points_SQL, parameters=dict(repo_id=repo_id, min_date=min_date)) # get table values to check for dupes later on @@ -247,7 +244,7 @@ def classify_anomalies(df, metric): "data_source": data_source } - with DatabaseSession(logger, engine) as session: + with get_session() as session: repo_insight_record_obj = RepoInsightsRecord(**record) session.add(repo_insight_record_obj) session.commit() @@ -292,7 +289,7 @@ def classify_anomalies(df, metric): "data_source": data_source } - with DatabaseSession(logger, engine) as session: + with get_session() as session: repo_insight_obj = RepoInsight(**data_point) session.add(repo_insight_obj) session.commit() diff --git a/augur/tasks/data_analysis/message_insights/setup.py b/augur/tasks/data_analysis/message_insights/setup.py deleted file mode 100644 index a4f6a30c43..0000000000 --- a/augur/tasks/data_analysis/message_insights/setup.py +++ /dev/null @@ -1,57 +0,0 @@ -#SPDX-License-Identifier: MIT - -import io -import os -import re - -from setuptools import find_packages -from setuptools import setup - -def read(filename): - filename = os.path.join(os.path.dirname(__file__), filename) - text_type = type(u"") - with io.open(filename, mode="r", encoding='utf-8') as fd: - return re.sub(text_type(r':[a-z]+:`~?(.*?)`'), text_type(r'``\1``'), fd.read()) - -setup( - name="message_insights", - version="0.3.1", - url="https://github.com/chaoss/augur", - license='MIT', - author="Augur Team", - author_email="akshblr555@gmail.com", - description="Message Insights worker that detects novel messages & analyzes sentiment from issue, PR messages", - packages=find_packages(), - install_requires=[ - 'Flask==2.0.2', - 'Flask-Cors==3.0.10', - 'Flask-Login==0.5.0', - 'Flask-WTF==1.0.0', - 'requests==2.28.0', - 'psycopg2-binary==2.9.3', - 'click==8.0.3', - 'scipy>=1.10.0', - 'scikit-learn==1.1.3', #0.24.2', - 'numpy==1.26.0', - 'nltk==3.6.6', - 'pandas==1.5.3', - 'emoji==1.2.0', - 'keras>=2.15.0', - 'Keras-Preprocessing', - 'tensorflow==2.15.0', - 'h5py==3.10.0', - 'scikit-image==0.19.1', - 'joblib==1.2.0', - 'xgboost', - 'bs4==0.0.1', - 'xlrd==2.0.1', - 'gensim>=4.2.0' - ], - classifiers=[ - 'Development Status :: 3 - Alpha', - 'License :: OSI Approved :: MIT License', - 'Programming 
Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.7', - ] -) diff --git a/augur/tasks/data_analysis/message_insights/tasks.py b/augur/tasks/data_analysis/message_insights/tasks.py index 6cc0446ab8..fe12bb9606 100644 --- a/augur/tasks/data_analysis/message_insights/tasks.py +++ b/augur/tasks/data_analysis/message_insights/tasks.py @@ -12,10 +12,8 @@ from augur.tasks.data_analysis.message_insights.message_sentiment import get_senti_score from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.session import DatabaseSession -from augur.application.db.lib import get_value -from augur.application.db.models import Repo, MessageAnalysis, MessageAnalysisSummary -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_value, get_repo_by_repo_git, get_session +from augur.application.db.models import MessageAnalysis, MessageAnalysisSummary from augur.tasks.init.celery_app import AugurMlRepoCollectionTask #SPDX-License-Identifier: MIT @@ -28,12 +26,11 @@ def message_insight_task(self, repo_git): logger = logging.getLogger(message_insight_task.__name__) engine = self.app.engine - with DatabaseSession(logger, engine) as session: - message_insight_model(repo_git, logger, engine, session) + message_insight_model(repo_git, logger, engine) -def message_insight_model(repo_git: str,logger,engine, session) -> None: +def message_insight_model(repo_git: str,logger,engine) -> None: full_train = True begin_date = '' @@ -45,8 +42,8 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: now = datetime.datetime.utcnow() run_id = int(now.timestamp())+5 - query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id models_dir = os.path.join(ROOT_AUGUR_DIRECTORY, "tasks", "data_analysis", "message_insights", get_value("Message_Insights", 'models_dir')) insight_days = get_value("Message_Insights", 'insight_days') @@ -193,32 +190,34 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: logger.info('Begin message_analysis data insertion...') logger.info(f'{df_message.shape[0]} data records to be inserted') - for row in df_message.itertuples(index=False): - try: - msg = { - "msg_id": row.msg_id, - "worker_run_id": run_id, - "sentiment_score": row.sentiment_score, - "reconstruction_error": row.rec_err, - "novelty_flag": row.novel_label, - "feedback_flag": None, - "tool_source": tool_source, - "tool_version": tool_version, - "data_source": data_source, - } - - message_analysis_object = MessageAnalysis(**msg) - session.add(message_analysis_object) - session.commit() - - # result = create_database_engine().execute(message_analysis_table.insert().values(msg)) - logger.info( - f'Primary key inserted into the message_analysis table: {message_analysis_object.msg_analysis_id}') - # logger.info( - # f'Inserted data point {results_counter} with msg_id {row.msg_id} and timestamp {row.msg_timestamp}') - except Exception as e: - logger.error(f'Error occurred while storing datapoint {repr(e)}') - break + with get_session() as session: + + for row in df_message.itertuples(index=False): + try: + msg = { + "msg_id": row.msg_id, + "worker_run_id": run_id, + "sentiment_score": row.sentiment_score, + "reconstruction_error": row.rec_err, + "novelty_flag": row.novel_label, + "feedback_flag": None, + "tool_source": tool_source, + "tool_version": tool_version, 
+ "data_source": data_source, + } + + message_analysis_object = MessageAnalysis(**msg) + session.add(message_analysis_object) + session.commit() + + # result = create_database_engine().execute(message_analysis_table.insert().values(msg)) + logger.info( + f'Primary key inserted into the message_analysis table: {message_analysis_object.msg_analysis_id}') + # logger.info( + # f'Inserted data point {results_counter} with msg_id {row.msg_id} and timestamp {row.msg_timestamp}') + except Exception as e: + logger.error(f'Error occurred while storing datapoint {repr(e)}') + break logger.info('Data insertion completed\n') @@ -318,27 +317,30 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: # Insertion of sentiment ratios & novel counts to repo level table logger.info('Begin repo wise insights insertion...') logger.info(f'{df_senti.shape[0]} data records to be inserted\n') - for row in df_trend.itertuples(): - msg = { - "repo_id": repo_id, - "worker_run_id": run_id, - "positive_ratio": row.PosR, - "negative_ratio": row.NegR, - "novel_count": row.Novel, - "period": row.Index, - "tool_source": tool_source, - "tool_version": tool_version, - "data_source": data_source - } - - message_analysis_summary_object = MessageAnalysisSummary(**msg) - session.add(message_analysis_summary_object) - session.commit() - - # result = create_database_engine().execute(message_analysis_summary_table.insert().values(msg)) - logger.info( - f'Primary key inserted into the message_analysis_summary table: {message_analysis_summary_object.msg_summary_id}') - # logger.info(f'Inserted data point {results_counter} for insight_period {row.Index}') + + with get_session() as session: + + for row in df_trend.itertuples(): + msg = { + "repo_id": repo_id, + "worker_run_id": run_id, + "positive_ratio": row.PosR, + "negative_ratio": row.NegR, + "novel_count": row.Novel, + "period": row.Index, + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source + } + + message_analysis_summary_object = MessageAnalysisSummary(**msg) + session.add(message_analysis_summary_object) + session.commit() + + # result = create_database_engine().execute(message_analysis_summary_table.insert().values(msg)) + logger.info( + f'Primary key inserted into the message_analysis_summary table: {message_analysis_summary_object.msg_summary_id}') + # logger.info(f'Inserted data point {results_counter} for insight_period {row.Index}') logger.info('Data insertion completed\n') diff --git a/augur/tasks/data_analysis/pull_request_analysis_worker/setup.py b/augur/tasks/data_analysis/pull_request_analysis_worker/setup.py deleted file mode 100644 index 3341f24ff1..0000000000 --- a/augur/tasks/data_analysis/pull_request_analysis_worker/setup.py +++ /dev/null @@ -1,47 +0,0 @@ -import io -import os -import re - -from setuptools import find_packages -from setuptools import setup - -def read(filename): - filename = os.path.join(os.path.dirname(__file__), filename) - text_type = type(u"") - with io.open(filename, mode="r", encoding='utf-8') as fd: - return re.sub(text_type(r':[a-z]+:`~?(.*?)`'), text_type(r'``\1``'), fd.read()) - -setup( - name="pull_request_analysis_worker", - version="0.0.0", - url="https://github.com/chaoss/augur", - license='MIT', - author="Augur Team", - author_email="akshblr555@gmail.com", - description="Pull Request Analysis worker that predicts acceptance of a PR", - packages=find_packages(), - install_requires=[ - 'Flask==2.0.2', - 'Flask-Cors==3.0.10', - 'Flask-Login==0.5.0', - 'Flask-WTF==1.0.0', - 
'requests==2.28.0', - 'psycopg2-binary==2.9.3', - 'sklearn==0.0', - 'nltk==3.6.6', - 'numpy==1.26.0', - 'pandas==1.5.3', - 'emoji==1.2.0', - 'joblib==1.2.0', - 'xgboost==1.4.2', - 'scipy>=1.10.0' - ], - classifiers=[ - 'Development Status :: 2 - Pre-Alpha', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.7', - ] - -) diff --git a/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py b/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py index af806bcdd1..2347eb109c 100644 --- a/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py +++ b/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py @@ -9,10 +9,8 @@ from augur.tasks.data_analysis.message_insights.message_sentiment import get_senti_score from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.session import DatabaseSession -from augur.application.db.lib import get_value -from augur.application.db.models import Repo, PullRequestAnalysis -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_value, get_session, get_repo_by_repo_git +from augur.application.db.models import PullRequestAnalysis from augur.tasks.init.celery_app import AugurMlRepoCollectionTask @@ -40,14 +38,11 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: insight_days = 200 - with DatabaseSession(logger, engine) as session: + repo_id = get_repo_by_repo_git(repo_git).repo_id - query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id + senti_models_dir = os.path.join(ROOT_AUGUR_DIRECTORY, "tasks", "data_analysis", "message_insights", get_value("Message_Insights", 'models_dir')) - senti_models_dir = os.path.join(ROOT_AUGUR_DIRECTORY, "tasks", "data_analysis", "message_insights", get_value("Message_Insights", 'models_dir')) - - logger.info(f'Sentiment model dir located - {senti_models_dir}') + logger.info(f'Sentiment model dir located - {senti_models_dir}') # Any initial database instructions, like finding the last tuple inserted or generate the next ID value @@ -211,7 +206,7 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: logger.info('Begin PR_analysis data insertion...') logger.info(f'{df.shape[0]} data records to be inserted') - with DatabaseSession(logger, engine) as session: + with get_session() as session: for row in df.itertuples(index=False): try: msg = { diff --git a/augur/tasks/db/refresh_materialized_views.py b/augur/tasks/db/refresh_materialized_views.py index c191b56039..8a06ac7a61 100644 --- a/augur/tasks/db/refresh_materialized_views.py +++ b/augur/tasks/db/refresh_materialized_views.py @@ -3,7 +3,10 @@ import sqlalchemy as s from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.session import DatabaseSession +from augur.application.db.lib import execute_sql +from augur.tasks.git.util.facade_worker.facade_worker.config import FacadeHelper +from augur.tasks.git.util.facade_worker.facade_worker.rebuildcache import invalidate_caches, rebuild_unknown_affiliation_and_web_caches + @celery.task(bind=True) def refresh_materialized_views(self): @@ -86,96 +89,113 @@ def refresh_materialized_views(self): """) try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv1_refresh) + execute_sql(mv1_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with 
DatabaseSession(logger, engine) as session: - session.execute_sql(mv2_refresh) + execute_sql(mv2_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv3_refresh) + execute_sql(mv3_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv4_refresh) + execute_sql(mv4_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv5_refresh) + execute_sql(mv5_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv6_refresh) + execute_sql(mv6_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv7_refresh) + execute_sql(mv7_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv8_refresh) + execute_sql(mv8_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv9_refresh) + execute_sql(mv9_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv10_refresh) + execute_sql(mv10_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv11_refresh) + execute_sql(mv11_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv12_refresh) + execute_sql(mv12_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv13_refresh) + execute_sql(mv13_refresh) except Exception as e: logger.info(f"error is {e}") pass + #Now refresh facade tables + #Use this class to get all the settings and + #utility functions for facade + facade_helper = FacadeHelper(logger) + + if facade_helper.nuke_stored_affiliations: + logger.error("Nuke stored affiliations is deprecated!") + # deprecated because the UI component of facade where affiliations would be + # nuked upon change no longer exists, and this information can easily be derived + # from queries and materialized views in the current version of Augur. + # This method is also a major performance bottleneck with little value. + + if not facade_helper.limited_run or (facade_helper.limited_run and facade_helper.fix_affiliations): + logger.error("Fill empty affiliations is deprecated!") + # deprecated because the UI component of facade where affiliations would need + # to be fixed upon change no longer exists, and this information can easily be derived + # from queries and materialized views in the current version of Augur. + # This method is also a major performance bottleneck with little value. 
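The thirteen refresh blocks above all follow the same pattern: run one REFRESH statement through `execute_sql` and, if it fails, log the error and move on so the remaining views still get refreshed. A minimal sketch of that pattern written as a loop, assuming the `execute_sql` helper this patch imports from `augur.application.db.lib`; the view names are illustrative, not the patch's exact set:

```python
# Sketch only, not the patch's code: collapse the repeated try/except blocks
# into a loop over the refresh statements.
import logging
import sqlalchemy as s

from augur.application.db.lib import execute_sql  # helper added by this patch

logger = logging.getLogger(__name__)

# Illustrative statements; the real task defines mv1_refresh .. mv13_refresh.
refresh_statements = [
    s.sql.text("REFRESH MATERIALIZED VIEW augur_data.explorer_contributor_actions"),
    s.sql.text("REFRESH MATERIALIZED VIEW augur_data.explorer_commits_and_committers_daily_count"),
    # ... remaining view refreshes elided
]

for stmt in refresh_statements:
    try:
        execute_sql(stmt)
    except Exception as e:
        # Same behaviour as the patch: log the error and keep going so one
        # broken view definition does not block the rest of the task.
        logger.info(f"error is {e}")
```

Keeping each refresh isolated in its own try/except is what lets a single failing view fail quietly while the others still complete.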
+ + if facade_helper.force_invalidate_caches: + try: + invalidate_caches(facade_helper) + except Exception as e: + logger.info(f"error is {e}") + + if not facade_helper.limited_run or (facade_helper.limited_run and facade_helper.rebuild_caches): + try: + rebuild_unknown_affiliation_and_web_caches(facade_helper) + except Exception as e: + logger.info(f"error is {e}") diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index fffd79d330..d1a3918143 100644 --- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py @@ -1,9 +1,19 @@ import logging import re +import sqlalchemy as s +import urllib.parse +from time import sleep + from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.github.util.github_task_session import GithubTaskSession -from augur.application.db.models import UserRepo, Repo, User +from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess +from augur.application.db.lib import get_group_by_name, get_repo_by_repo_git, get_github_repo_by_src_id, get_gitlab_repo_by_src_id +from augur.tasks.github.util.util import get_owner_repo +from augur.application.db.models.augur_operations import retrieve_owner_repos, FRONTEND_REPO_GROUP_NAME, RepoGroup, CollectionStatus +from augur.tasks.github.util.github_paginator import hit_api + +from augur.application.db.models import UserRepo, Repo def parse_org_name(string): @@ -15,84 +25,381 @@ def parse_org_and_repo_name(string): match = re.match(r'^\/?([a-zA-Z0-9_-]+)\/([a-zA-Z0-9_-]+)\/?$', string) return match - @celery.task -def add_org_repo_list(user_id, group_name, urls): +def add_github_orgs_and_repos(user_id, group_name, orgs, repo_urls): - logger = logging.getLogger(add_org_repo_list.__name__) + logger = logging.getLogger(add_github_orgs_and_repos.__name__) with GithubTaskSession(logger) as session: - user = User.get_by_id(session, user_id) - - invalid_urls = [] - valid_orgs = [] - valid_repos = [] - for url in urls: - - # matches https://github.com/{org}/ or http://github.com/{org} - if Repo.parse_github_org_url(url): - added = user.add_github_org(group_name, url)[0] - if added: - valid_orgs.append(url) - - # matches https://github.com/{org}/{repo}/ or http://github.com/{org}/{repo} - elif Repo.parse_github_repo_url(url)[0]: - added = user.add_github_repo(group_name, url)[0] - if added: - valid_repos.append(url) - - # matches /{org}/{repo}/ or /{org}/{repo} or {org}/{repo}/ or {org}/{repo} - elif (match := parse_org_and_repo_name(url)): - org, repo = match.groups() - repo_url = f"https://github.com/{org}/{repo}/" - added = user.add_github_repo(group_name, repo_url)[0] - if added: - valid_repos.append(url) - - # matches /{org}/ or /{org} or {org}/ or {org} - elif (match := parse_org_name(url)): - org = match.group(1) - org_url = f"https://github.com/{org}/" - added = user.add_github_org(group_name, org_url)[0] - if added: - valid_orgs.append(url) - - # matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo} - elif Repo.parse_gitlab_repo_url(url)[0]: - - added = user.add_gitlab_repo(group_name, url)[0] - if added: - valid_repos.append(url) - - else: - invalid_urls.append(url) - - return valid_orgs, valid_repos, invalid_urls + # determine group id from name + group = get_group_by_name(user_id, group_name) + if not group: + logger.error(f"Error while adding repo. Invalid group name of {group_name}. 
Cannot insert repos") + return + + group_id = group.group_id + # get frontend repo group + frontend_repo_group = RepoGroup.get_by_name(session, FRONTEND_REPO_GROUP_NAME) + if not frontend_repo_group: + logger.error("Error while adding repo: Could not find frontend repo group so repos cannot be inserted") + return - + repo_group_id = frontend_repo_group.repo_group_id + + # define repo_data and assoicate repos with frontend repo group + repo_data = [(url, repo_group_id) for url in repo_urls] + + # get org repos and associate them with their org repo group + org_repo_data = get_org_repo_data(orgs, session) + repo_data.extend(org_repo_data) + + # break list of repos into lists of 100 so that graphql query isn't overwhelmed + for chunk in divide_list_into_chunks(repo_data, 100): + + add_new_github_repos(chunk, group_id, session, logger) -# TODO: Change to github specific @celery.task -def add_repo(user_id, group_name, repo_url): +def add_gitlab_repos(user_id, group_name, repo_urls): - logger = logging.getLogger(add_org.__name__) + logger = logging.getLogger(add_github_orgs_and_repos.__name__) with GithubTaskSession(logger) as session: - result = UserRepo.add_github_repo(session, repo_url, user_id, group_name) + + # determine group id from name + group = get_group_by_name(user_id, group_name) + if not group: + logger.error(f"Error while adding repo. Invalid group name of {group_name}. Cannot insert repos") + return + + group_id = group.group_id + + # get frontend repo group + frontend_repo_group = RepoGroup.get_by_name(session, FRONTEND_REPO_GROUP_NAME) + if not frontend_repo_group: + logger.error("Error while adding repo: Could not find frontend repo group so repos cannot be inserted") + return + + repo_group_id = frontend_repo_group.repo_group_id + + for url in repo_urls: + + result = get_gitlab_repo_data(session, url, logger) + if not result: + continue + + if "id" not in result: + logger.error(f"Gitlab repo data returned without id. Url: {url}. 
Data: {result}") + continue + + repo_src_id = result["id"] + + existing_repo = get_gitlab_repo_by_src_id(repo_src_id) + if existing_repo: + + if existing_repo.repo_group_id != repo_group_id: + update_existing_repos_repo_group_id(session, existing_repo.repo_id, repo_group_id) + + add_existing_repo_to_group(logger, session, group_id, existing_repo.repo_id) + continue + + existing_repo = get_repo_by_repo_git(session, url) + if existing_repo: + + if existing_repo.repo_group_id != repo_group_id: + update_existing_repos_repo_group_id(session, existing_repo.repo_id, repo_group_id) + + # TODO: add logic to update the existing records repo_group_id if it isn't equal to the existing record + add_existing_repo_to_group(logger, session, group_id, existing_repo.repo_id) + continue + + add_gitlab_repo(logger, session, url, repo_group_id, group_id, repo_src_id) + + +def add_gitlab_repo(session, url, repo_group_id, group_id): + + repo_id = Repo.insert_gitlab_repo(session, url, repo_group_id, "Frontend") + if not repo_id: + return False, {"status": "Repo insertion failed", "repo_url": url} + + result = UserRepo.insert(session, repo_id, group_id) + if not result: + return False, {"status": "repo_user insertion failed", "repo_url": url} + + +def get_org_repo_data(orgs, session): + + repo_data = [] + for org in orgs: + + # create repo group for org if it doesn't exist + repo_group = RepoGroup.get_by_name(session, org) + if not repo_group: + repo_group = create_repo_group(session, org) + + # retrieve repo urls for org + org_repos, _ = retrieve_owner_repos(session, org) + if not org_repos: + continue + + # define urls and repo_group_id of org and then add to repo_data + org_repo_data = [(url, repo_group.repo_group_id) for url in org_repos] + repo_data.extend(org_repo_data) + + return repo_data + +# TODO: Do we need to check if the repo already exists in the user group? 
+def add_new_github_repos(repo_data, group_id, session, logger): + + # get data for repos to determine type, src id, and if they exist + data = get_github_repos_data(repo_data, session, logger) + + for url, repo_group_id in repo_data: + + repo_data = data[url] + if not repo_data: + # skip since cause the repo is not valid (doesn't exist likely) + continue + + repo_src_id = repo_data["databaseId"] + repo_type = repo_data["owner"]["__typename"] + + existing_repo = get_github_repo_by_src_id(repo_src_id) + if existing_repo: + + if existing_repo.repo_group_id != repo_group_id: + update_existing_repos_repo_group_id(session, existing_repo.repo_id, repo_group_id) + + add_existing_repo_to_group(logger, session, group_id, existing_repo.repo_id) + continue + + existing_repo = get_repo_by_repo_git(session, url) + if existing_repo: + + if existing_repo.repo_group_id != repo_group_id: + update_existing_repos_repo_group_id(session, existing_repo.repo_id, repo_group_id) + + add_existing_repo_to_group(logger, session, group_id, existing_repo.repo_id) + continue + + add_github_repo(logger, session, url, repo_group_id, group_id, repo_type, repo_src_id) + + +def add_existing_repo_to_group(logger, session, group_id, repo_id): + + UserRepo.insert(session, repo_id, group_id) + + +def divide_list_into_chunks(data, size): + + for i in range(0, len(data), size): + yield data[i:i + size] + + +# TODO: Make it only get like 100 at a time +def get_github_repos_data(repo_data, session, logger): + + repo_urls = [x[0] for x in repo_data] + + github_graphql_data_access = GithubGraphQlDataAccess(session.oauths, logger, ingore_not_found_error=True) + + query_parts = [] + repo_map = {} + for i, url in enumerate(repo_urls): + owner, repo = get_owner_repo(url) + query_parts.append(f"""repo_{i}: repository(owner: "{owner}", name: "{repo}") {{ + databaseId, owner {{ __typename }} + }}""") + + query = f"query GetRepoIds {{ {' '.join(query_parts)}}}" + + data = github_graphql_data_access.get_resource(query, {}, []) + + result_data = {} + for i, url in enumerate(repo_urls): + result_data[url] = data[f"repo_{i}"] + + return result_data + +def get_repo_by_repo_git(session, url): - print(repo_url, result) + return session.query(Repo).filter(Repo.repo_git == url).first() + +def create_repo_group(session, owner): + + repo_group = RepoGroup(rg_name=owner.lower(), rg_description="", rg_website="", rg_recache=0, rg_type="Unknown", + tool_source="Loaded by user", tool_version="1.0", data_source="Git") + session.add(repo_group) + session.commit() + + return repo_group + +def add_github_repo(logger, session, url, repo_group_id, group_id, repo_type, repo_src_id): + + # These two things really need to be done in one commit in the future to prevent one existing without the other + repo_id = Repo.insert_github_repo(session, url, repo_group_id, "Frontend", repo_type, repo_src_id) + if not repo_id: + logger.error("Error while adding repo: Failed to insert github repo") + return + + result = UserRepo.insert(session, repo_id, group_id) + if not result: + logger.error(f"Error while adding repo: Failed to insert user repo record. 
A record with a repo_id of {repo_id} and a group id of {group_id} needs to be added to the user repo table so that this repo shows up in the users group") + return + + CollectionStatus.insert(session, logger, repo_id) + + +def get_gitlab_repo_data(gl_session, url: str, logger) -> bool: + + REPO_ENDPOINT = "https://gitlab.com/api/v4/projects/{}/" + + owner, repo = Repo.parse_gitlab_repo_url(url) + if not owner or not repo: + logger.error(f"Tried to get gitlab repo data for invalid url: {url}") + return None + + # Encode namespace and project name for the API request + project_identifier = urllib.parse.quote(f"{owner}/{repo}", safe='') + url = REPO_ENDPOINT.format(project_identifier) + + attempts = 0 + while attempts < 10: + response = hit_api(gl_session.oauths, url, logger) + + if wait_in_seconds := response.headers.get("Retry-After") is not None: + sleep(int(wait_in_seconds)) + + if response.status_code == 404: + return None + + if response.status_code == 200: + return response.json() + + attempts += 1 + sleep(attempts*3) + + logger.error(f"Failed to get gitlab repo data after multiple attemps. Url: {url}") + + return None + +def add_gitlab_repo(logger, session, url, repo_group_id, group_id, repo_src_id): + + # These two things really need to be done in one commit in the future to prevent one existing without the other + repo_id = Repo.insert_gitlab_repo(session, url, repo_group_id, "Frontend", repo_src_id) + if not repo_id: + logger.error("Error while adding repo: Failed to insert github repo") + return + + result = UserRepo.insert(session, repo_id, group_id) + if not result: + logger.error(f"Error while adding repo: Failed to insert user repo record. A record with a repo_id of {repo_id} and a group id of {group_id} needs to be added to the user repo table so that this repo shows up in the users group") + return + + CollectionStatus.insert(session, logger, repo_id) + +def update_existing_repos_repo_group_id(session, repo_id, new_repo_group_id): + + # NOTE: It is safe to update the repos repo group id here because we know it will always be updating to an org repo group id. 
We don't want this behavior from the command line though, because a user adding a repo to a repo group could remove it from it's org repo group + update_stmt = ( + s.update(Repo) + .where(Repo.repo_id == repo_id) + .values(repo_group_id=new_repo_group_id) + ) + session.execute(update_stmt) + session.commit() + +# @celery.task +# def add_org_repo_list(user_id, group_name, urls): + +# logger = logging.getLogger(add_org_repo_list.__name__) + +# with GithubTaskSession(logger) as session: + +# user = User.get_by_id(session, user_id) + +# invalid_urls = [] +# valid_orgs = [] +# valid_repos = [] +# for url in urls: + +# # matches https://github.com/{org}/ or http://github.com/{org} +# if Repo.parse_github_org_url(url): +# added = user.add_github_org(group_name, url)[0] +# if added: +# valid_orgs.append(url) + +# # matches https://github.com/{org}/{repo}/ or http://github.com/{org}/{repo} +# elif Repo.parse_github_repo_url(url)[0]: +# added = user.add_github_repo(group_name, url)[0] +# if added: +# valid_repos.append(url) + +# # matches /{org}/{repo}/ or /{org}/{repo} or {org}/{repo}/ or {org}/{repo} +# elif (match := parse_org_and_repo_name(url)): +# org, repo = match.groups() +# repo_url = f"https://github.com/{org}/{repo}/" +# added = user.add_github_repo(group_name, repo_url)[0] +# if added: +# valid_repos.append(url) + +# # matches /{org}/ or /{org} or {org}/ or {org} +# elif (match := parse_org_name(url)): +# org = match.group(1) +# org_url = f"https://github.com/{org}/" +# added = user.add_github_org(group_name, org_url)[0] +# if added: +# valid_orgs.append(url) + +# # matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo} +# elif Repo.parse_gitlab_repo_url(url)[0]: + +# added = user.add_gitlab_repo(group_name, url)[0] +# if added: +# valid_repos.append(url) + +# else: +# invalid_urls.append(url) + +# return valid_orgs, valid_repos, invalid_urls + + + # TODO: Change to github specific -@celery.task -def add_org(user_id, group_name, org_url): +# @celery.task +# def add_repo(user_id, group_name, repo_url): + +# logger = logging.getLogger(add_org.__name__) + +# with GithubTaskSession(logger) as session: +# result = UserRepo.add_github_repo(session, repo_url, user_id, group_name) + +# print(repo_url, result) + + +# # TODO: Change to github specific +# @celery.task +# def add_org(user_id, group_name, org_url): + +# logger = logging.getLogger(add_org.__name__) + +# with GithubTaskSession(logger) as session: +# result = UserRepo.add_github_org_repos(session, org_url, user_id, group_name) + +# print(org_url, result) + + + + + + + + + - logger = logging.getLogger(add_org.__name__) - with GithubTaskSession(logger) as session: - result = UserRepo.add_github_org_repos(session, org_url, user_id, group_name) - print(org_url, result) diff --git a/augur/tasks/git/dependency_libyear_tasks/core.py b/augur/tasks/git/dependency_libyear_tasks/core.py index 9e48757d61..b892570ad2 100644 --- a/augur/tasks/git/dependency_libyear_tasks/core.py +++ b/augur/tasks/git/dependency_libyear_tasks/core.py @@ -1,43 +1,39 @@ from datetime import datetime from augur.application.db.models import * -from augur.application.db.lib import get_value -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_value, bulk_insert_dicts, get_repo_by_repo_git from augur.tasks.git.dependency_libyear_tasks.libyear_util.util import get_deps_libyear_data from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path -def deps_libyear_model( 
session, repo_id,repo_git,repo_group_id): +def deps_libyear_model(logger,repo_git): """ Data collection and storage method """ - session.logger.info(f"This is the libyear deps model repo: {repo_git}") + logger.info(f"This is the libyear deps model repo: {repo_git}") #result = re.search(r"https:\/\/(github\.com\/[A-Za-z0-9 \- _]+\/)([A-Za-z0-9 \- _ .]+)$", repo_git).groups() #relative_repo_path = f"{repo_group_id}/{result[0]}{result[1]}" - query = session.query(Repo).filter( - Repo.repo_git == repo_git) - - result = execute_session_query(query, 'one') + + repo = get_repo_by_repo_git(repo_git) - absolute_repo_path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo_id,result.repo_path,result.repo_name) + absolute_repo_path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo.repo_id,repo.repo_path,repo.repo_name) #config.get_section("Facade")['repo_directory'] + relative_repo_path#self.config['repo_directory'] + relative_repo_path - generate_deps_libyear_data(session,repo_id, absolute_repo_path) + generate_deps_libyear_data(logger, repo.repo_id, absolute_repo_path) -def generate_deps_libyear_data(session, repo_id, path): +def generate_deps_libyear_data(logger, repo_id, path): """Scans for package files and calculates libyear - :param session: Task manifest and database session. :param repo_id: Repository ID :param path: Absolute path of the Repostiory """ date_scanned = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') - session.logger.info('Searching for deps in repo') - session.logger.info(f'Repo ID: {repo_id}, Path: {path}') + logger.info('Searching for deps in repo') + logger.info(f'Repo ID: {repo_id}, Path: {path}') - deps = get_deps_libyear_data(path,session.logger) + deps = get_deps_libyear_data(path,logger) if not deps: - session.logger.info(f"No deps found for repo {repo_id} on path {path}") + logger.info(f"No deps found for repo {repo_id} on path {path}") return to_insert = [] @@ -66,6 +62,6 @@ def generate_deps_libyear_data(session, repo_id, path): # VALUES (:repo_id, :name,:requirement,:type,:package_manager,:current_verion,:latest_version,:current_release_date,:latest_release_date,:libyear,:tool_source,:tool_version,:data_source, :data_collection_date) #""").bindparams(**repo_deps) # - #session.execute_sql(insert_statement) to_insert.append(repo_deps) - session.insert_data(to_insert, RepoDepsLibyear, ["repo_id","name","data_collection_date"]) + + bulk_insert_dicts(logger, to_insert, RepoDepsLibyear, ["repo_id","name","data_collection_date"]) diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py b/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py index bcfe810a9c..acd73e424a 100644 --- a/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py +++ b/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py @@ -1,84 +1,89 @@ import requests +import logging +import traceback + +logger = logging.getLogger(__name__) def get_NPM_data(package): url = "https://registry.npmjs.org/%s" % package r = requests.get(url) if r.status_code < 400: return r.json() + logger.warning(f"Failed to fetch data for package {package}. HTTP Status: {r.status_code}") return {} - def clean_version(version): version = [v for v in version if v.isdigit() or v == '.'] return ''.join(version) def split_version(version): - #Split version string into list seperated by . - #assign elements of list to respective variables. 
version_list = list(version.split('.')) patch = version_list.pop(-1) minor = version_list.pop(-1) major = version_list[0] - - return major,minor,patch - - + return major, minor, patch def get_latest_patch(version, data): + if 'versions' not in data: + logger.error(f"'versions' key not found in the NPM data for version {version}. Data: {data}") + raise KeyError("'versions' key not found") + versions = data['versions'] try: index = list(versions.keys()).index(version) except ValueError as e: + logger.error(f"Version {version} not found in the 'versions' list. Error: {e}") raise e - major,minor,patch = split_version(version) + major, minor, patch = split_version(version) consider_version = version for v in list(versions.keys())[index:]: - if v.split('.')[0]==major: - if v.split('.')[1]== minor: - if v.split('.')[2]>patch: + if v.split('.')[0] == major: + if v.split('.')[1] == minor: + if v.split('.')[2] > patch: consider_version = v return consider_version - def get_lastest_minor(version, data): + if 'versions' not in data: + logger.error(f"'versions' key not found in the NPM data. Data: {data}") + raise KeyError("'versions' key not found") + versions = data['versions'] try: index = list(versions.keys()).index(version) except ValueError as e: + logger.info(f"Version {version} not found in the 'versions' list. Error: {e}") raise e - major,minor,patch = split_version(version) - + major, minor, patch = split_version(version) consider_version = get_latest_patch(version, data) for v in list(versions.keys())[index:]: - if v.split('.')[0]==major: - if v.split('.')[1]>minor: - consider_version = v - return consider_version - + if v.split('.')[0] == major: + if v.split('.')[1] > minor: + consider_version = v + return consider_version def get_npm_release_date(data, version): - release_time = data['time'][version] + release_time = data['time'].get(version) if release_time: return release_time + logger.warning(f"Release time not found for version {version}") return None - def get_npm_latest_version(data): - return data['dist-tags']['latest'] + return data['dist-tags'].get('latest', 'unknown') -#add code here def get_npm_current_version(data, requirement): - if requirement[0]=='~': + if requirement[0] == '~': try: return get_latest_patch(clean_version(requirement), data) except ValueError: return None - elif requirement[0]=='^': + elif requirement[0] == '^': try: return get_lastest_minor(clean_version(requirement), data) except ValueError: return None else: - return requirement + return requirement \ No newline at end of file diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py b/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py index 7aaaf1f190..1824322f48 100644 --- a/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py +++ b/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py @@ -140,12 +140,12 @@ def parse_poetry_lock(file_handle): group = 'runtime' for package in manifest['package']: req = None - if package['category'] == 'main': + if package.get('category') == 'main': group = 'runtime' - if package['category'] == 'dev': + if package.get('category') == 'dev': group = 'develop' if 'version' in package: - req = package['version'] + req = package.get('version') elif 'git' in package: req = package['git']+'#'+package['ref'] Dict = {'name': package['name'], 'requirement': req, 'type': group, 'package': 'PYPI'} @@ -160,7 +160,14 @@ def parse_conda(file_handle): pip = None if not contents: return [] - dependencies = 
contents['dependencies'] + #dependencies = contents['dependencies'] + dependencies = contents.get('dependencies', []) + + if not dependencies: + print("No dependencies found.") + return [] + else: + print("Dependencies found.") for dep in dependencies: if (type(dep) is dict) and dep['pip']: pip = dep diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py b/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py index 111d3fc631..ffa2d4a84a 100644 --- a/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py +++ b/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py @@ -32,54 +32,56 @@ def get_parsed_deps(path,logger): deps_file = None dependency_list = list() - for f in file_list: deps_file = find(f, path) - if not deps_file: + + if not deps_file or not f: continue file_handle= open(deps_file) - if f == 'Requirement.txt': - dependency_list = parse_requirement_txt(file_handle) + short_file_name = os.path.split(deps_file)[-1] + + if short_file_name == 'Requirement.txt': + dependency_list.extend(parse_requirement_txt(file_handle)) - elif f == 'requirements.txt': - dependency_list = parse_requirement_txt(file_handle) + if short_file_name == 'requirements.txt': + dependency_list.extend(parse_requirement_txt(file_handle)) - elif f == 'setup.py': - dependency_list = parse_setup_py(file_handle) + if short_file_name == 'setup.py': + dependency_list.extend(parse_setup_py(file_handle)) - elif f == 'Pipfile': - dependency_list = parse_pipfile(file_handle) + if short_file_name == 'Pipfile': + dependency_list.extend(parse_pipfile(file_handle)) - elif f == 'Pipfile.lock': - dependency_list = parse_pipfile_lock(file_handle) + if short_file_name == 'Pipfile.lock': + dependency_list.extend(parse_pipfile_lock(file_handle)) - elif f == 'pyproject.toml': - dependency_list = parse_poetry(file_handle) + if short_file_name == 'pyproject.toml': + dependency_list.extend(parse_poetry(file_handle)) - elif f == 'poetry.lock': - dependency_list = parse_poetry_lock(file_handle) + if short_file_name == 'poetry.lock': + dependency_list.extend(parse_poetry_lock(file_handle)) - elif f == 'environment.yml': - dependency_list = parse_conda(file_handle) + if short_file_name == 'environment.yml': + dependency_list.extend(parse_conda(file_handle)) - elif f == 'environment.yaml': - dependency_list = parse_conda(file_handle) + if short_file_name == 'environment.yaml': + dependency_list.extend(parse_conda(file_handle)) - elif f == 'environment.yml.lock': - dependency_list = parse_conda(file_handle) + if f == 'environment.yml.lock': + dependency_list.extend(parse_conda(file_handle)) - elif f == 'environment.yaml.lock': - dependency_list = parse_conda(file_handle) + if short_file_name == 'environment.yaml.lock': + dependency_list.extend(parse_conda(file_handle)) - elif f == 'package.json': + if short_file_name == 'package.json': try: - dependency_list = parse_package_json(file_handle) + dependency_list.extend(parse_package_json(file_handle)) except KeyError as e: logger.error(f"package.json for repo at path {path} is missing required key: {e}\n Skipping file...") - return dependency_list + return dependency_list def get_libyear(current_version, current_release_date, latest_version, latest_release_date): diff --git a/augur/tasks/git/dependency_libyear_tasks/tasks.py b/augur/tasks/git/dependency_libyear_tasks/tasks.py index ff15c61d91..fbf121b2ac 100644 --- a/augur/tasks/git/dependency_libyear_tasks/tasks.py +++ b/augur/tasks/git/dependency_libyear_tasks/tasks.py @@ -1,22 +1,12 @@ import logging 
-from augur.application.db.session import DatabaseSession from augur.tasks.git.dependency_libyear_tasks.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask -from augur.application.db.util import execute_session_query @celery.task(base=AugurFacadeRepoCollectionTask, bind=True) def process_libyear_dependency_metrics(self, repo_git): #raise NotImplementedError - engine = self.app.engine - logger = logging.getLogger(process_libyear_dependency_metrics.__name__) - with DatabaseSession(logger, engine) as session: - logger.info(f"repo_git: {repo_git}") - query = session.query(Repo).filter(Repo.repo_git == repo_git) - - - repo = execute_session_query(query,'one') - deps_libyear_model(session, repo.repo_id,repo_git,repo.repo_group_id) \ No newline at end of file + deps_libyear_model(logger, repo_git) \ No newline at end of file diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index 296e69075e..9262f241b4 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -1,23 +1,35 @@ from datetime import datetime import os from augur.application.db.models import * +from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_value, get_session from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler from augur.tasks.git.dependency_tasks.dependency_util import dependency_calculator as dep_calc from augur.tasks.util.worker_util import parse_json_from_subprocess_call +from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from augur.tasks.util.metadata_exception import MetadataException -def generate_deps_data(session, repo_id, path): + +def generate_deps_data(logger, repo_git): """Run dependency logic on repo and stores data in database :param repo_id: Repository ID :param path: Absolute path of the Repostiory """ - - scan_date = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') - session.logger.info('Searching for deps in repo') - session.logger.info(f'Repo ID: {repo_id}, Path: {path}, Scan date: {scan_date}') + logger.info(f"repo_git: {repo_git}") + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id + + path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo.repo_id,repo.repo_path,repo.repo_name) - deps = dep_calc.get_deps(path,session.logger) + logger.debug(f"This is the deps model repo: {repo_git}.") + + scan_date = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') + logger.info('Searching for deps in repo') + logger.info(f'Repo ID: {repo_id}, Path: {path}, Scan date: {scan_date}') + + deps = dep_calc.get_deps(path,logger) to_insert = [] for dep in deps: @@ -33,85 +45,97 @@ def generate_deps_data(session, repo_id, path): } to_insert.append(repo_deps) - - session.insert_data(to_insert,RepoDependency,["repo_id","dep_name","data_collection_date"]) - session.logger.info(f"Inserted {len(deps)} dependencies for repo {repo_id}") + bulk_insert_dicts(logger, to_insert,RepoDependency,["repo_id","dep_name","data_collection_date"]) + + logger.info(f"Inserted {len(deps)} dependencies for repo {repo_id}") """ def deps_model(session, repo_id,repo_git,repo_path,repo_name): # Data collection and storage method - session.logger.info(f"This is the deps model repo: {repo_git}.") + logger.info(f"This is the deps model repo: {repo_git}.") 
generate_deps_data(session,repo_id, absolute_repo_path) """ -def generate_scorecard(session,repo_id,path): +def generate_scorecard(logger, repo_git): """Runs scorecard on repo and stores data in database :param repo_id: Repository ID - :param path: URL path of the Repostiory - """ - session.logger.info('Generating scorecard data for repo') - session.logger.info(f"Repo ID: {repo_id}, Path: {path}") + :param repo_git: URL path of the Repository + """ + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id + logger.info('Generating scorecard data for repo') # we convert relative path in the format required by scorecard like github.com/chaoss/augur # raw_path,_ = path.split('-') # scorecard_repo_path = raw_path[2:] - path = path[8:] + path = repo_git[8:] if path[-4:] == '.git': path = path.replace(".git", "") - command = '--repo='+ path + command = '--local=' + path #this is path where our scorecard project is located - path_to_scorecard = os.environ['HOME'] + '/scorecard' + path_to_scorecard = os.getenv('SCORECARD_DIR', os.environ['HOME'] + '/scorecard') + + #setting the environmental variable which is required by scorecard + + with get_session() as session: + #key_handler = GithubRandomKeyAuth(logger) + key_handler = GithubApiKeyHandler(logger) + os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() + # This seems outdated #setting the environmental variable which is required by scorecard - key_handler = GithubApiKeyHandler(session, session.logger) - os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() + #key_handler = GithubApiKeyHandler(session, session.logger) + #os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() - required_output = parse_json_from_subprocess_call(session.logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) + try: + required_output = parse_json_from_subprocess_call(logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) - session.logger.info('adding to database...') - session.logger.debug(f"output: {required_output}") + logger.info('adding to database...') + logger.debug(f"output: {required_output}") - if not required_output['checks']: - session.logger.info('No scorecard checks found!') - return - - #Store the overall score first - to_insert = [] - overall_deps_scorecard = { - 'repo_id': repo_id, - 'name': 'OSSF_SCORECARD_AGGREGATE_SCORE', - 'scorecard_check_details': required_output['repo'], - 'score': required_output['score'], - 'tool_source': 'scorecard_model', - 'tool_version': '0.43.9', - 'data_source': 'Git', - 'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') - } - to_insert.append(overall_deps_scorecard) - # session.insert_data(overall_deps_scorecard, RepoDepsScorecard, ["repo_id","name"]) - - #Store misc data from scorecard in json field. 
- for check in required_output['checks']: - repo_deps_scorecard = { + if not required_output.get('checks'): + logger.info('No scorecard checks found!') + return + + #Store the overall score first + to_insert = [] + overall_deps_scorecard = { 'repo_id': repo_id, - 'name': check['name'], - 'scorecard_check_details': check, - 'score': check['score'], + 'name': 'OSSF_SCORECARD_AGGREGATE_SCORE', + 'scorecard_check_details': required_output['repo'], + 'score': required_output['score'], 'tool_source': 'scorecard_model', 'tool_version': '0.43.9', 'data_source': 'Git', 'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') } - to_insert.append(repo_deps_scorecard) - - session.insert_data(to_insert, RepoDepsScorecard, ["repo_id","name"]) - - session.logger.info(f"Done generating scorecard for repo {repo_id} from path {path}") + to_insert.append(overall_deps_scorecard) + # bulk_insert_dicts(overall_deps_scorecard, RepoDepsScorecard, ["repo_id","name"]) + #Store misc data from scorecard in json field. + for check in required_output['checks']: + repo_deps_scorecard = { + 'repo_id': repo_id, + 'name': check['name'], + 'scorecard_check_details': check, + 'score': check['score'], + 'tool_source': 'scorecard_model', + 'tool_version': '0.43.9', + 'data_source': 'Git', + 'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') + } + to_insert.append(repo_deps_scorecard) + + bulk_insert_dicts(logger, to_insert, RepoDepsScorecard, ["repo_id","name"]) + + logger.info(f"Done generating scorecard for repo {repo_id} from path {path}") + except Exception as e: + + raise MetadataException(e, f"required_output: {required_output}") diff --git a/augur/tasks/git/dependency_tasks/tasks.py b/augur/tasks/git/dependency_tasks/tasks.py index 152c053080..731c71d003 100644 --- a/augur/tasks/git/dependency_tasks/tasks.py +++ b/augur/tasks/git/dependency_tasks/tasks.py @@ -1,34 +1,17 @@ import logging import traceback -from augur.application.db.session import DatabaseSession from augur.tasks.git.dependency_tasks.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask, AugurSecondaryRepoCollectionTask -from augur.application.db.util import execute_session_query -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path -from augur.application.db.lib import get_value +from augur.tasks.util.metadata_exception import MetadataException -@celery.task(base=AugurFacadeRepoCollectionTask, bind=True) -def process_dependency_metrics(self, repo_git): - #raise NotImplementedError - - engine = self.app.engine +@celery.task(base=AugurFacadeRepoCollectionTask) +def process_dependency_metrics(repo_git): logger = logging.getLogger(process_dependency_metrics.__name__) - with DatabaseSession(logger, engine) as session: - logger.info(f"repo_git: {repo_git}") - query = session.query(Repo).filter(Repo.repo_git == repo_git) - - - repo = execute_session_query(query,'one') - - absolute_repo_path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo.repo_id,repo.repo_path,repo.repo_name) - - session.logger.debug(f"This is the deps model repo: {repo_git}.") - - generate_deps_data(session,repo.repo_id,absolute_repo_path) + generate_deps_data(logger, repo_git) @celery.task(base=AugurSecondaryRepoCollectionTask, bind=True) @@ -38,10 +21,26 @@ def process_ossf_dependency_metrics(self, repo_git): logger = logging.getLogger(process_ossf_dependency_metrics.__name__) - with 
DatabaseSession(logger, engine) as session: - logger.info(f"repo_git: {repo_git}") - - query = session.query(Repo).filter(Repo.repo_git == repo_git) + try: + generate_scorecard(logger, repo_git) + except Exception as e: + logger.warning(f'Exception generating scorecard: {e}') + tracer = ''.join(traceback.format_exception(type(e), e, e.__traceback__)) + logger.warning(f'Full stack trace of OpenSSF scorecard error: {tracer}') + raise MetadataException(e,f"An error occurred while generating the scorecard: {str(e)}") + + """ + This try/except block is an attempt to get more information about this occasional error: - repo = execute_session_query(query,'one') - generate_scorecard(session, repo.repo_id, repo_git) \ No newline at end of file + ```bash + Traceback (most recent call last): + File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/billiard/pool.py", line 366, in workloop + put((READY, (job, i, result, inqW_fd))) + File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/billiard/queues.py", line 366, in put + self.send_payload(ForkingPickler.dumps(obj)) + ^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/billiard/reduction.py", line 56, in dumps + cls(buf, protocol).dump(obj) + billiard.pool.MaybeEncodingError: Error sending result: ''(1, , None)''. Reason: ''PicklingError("Can\'t pickle : it\'s not the same object as augur.tasks.util.metadata_exception.MetadataException")''. + ``` + """ \ No newline at end of file diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index 9fdaba7664..26ba2f4b4f 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -1,19 +1,19 @@ #SPDX-License-Identifier: MIT import logging +import datetime from celery import group, chain -import sqlalchemy as s +from subprocess import check_output +from augur.application.db.lib import get_session, get_repo_by_repo_git, get_repo_by_repo_id, remove_working_commits_by_repo_id_and_hashes, get_working_commits_by_repo_id, facade_bulk_insert_commits, bulk_insert_dicts, get_missing_commit_message_hashes from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import trim_commits from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path, get_parent_commits_set, get_existing_commits_set from augur.tasks.git.util.facade_worker.facade_worker.analyzecommit import analyze_commit -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_repo_commit_count, update_facade_scheduling_fields, get_facade_weight_with_commit_count, facade_bulk_insert_commits -from augur.tasks.git.util.facade_worker.facade_worker.rebuildcache import fill_empty_affiliations, invalidate_caches, nuke_affiliations, rebuild_unknown_affiliation_and_web_caches -from augur.tasks.git.util.facade_worker.facade_worker.postanalysiscleanup import git_repo_cleanup - +from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_repo_commit_count, update_facade_scheduling_fields, get_facade_weight_with_commit_count from augur.tasks.github.facade_github.tasks import * +from augur.tasks.git.util.facade_worker.facade_worker.config import FacadeHelper from augur.tasks.util.collection_state import CollectionState from augur.tasks.util.collection_util import get_collection_status_repo_git_from_filter from augur.tasks.git.util.facade_worker.facade_worker.repofetch import GitCloneError, git_repo_initialize, git_repo_updates @@ -24,7 +24,7 @@ 
from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask -from augur.application.db.models import Repo, CollectionStatus +from augur.application.db.models import Repo, CollectionStatus, CommitMessage from augur.tasks.git.dependency_tasks.tasks import process_dependency_metrics from augur.tasks.git.dependency_libyear_tasks.tasks import process_libyear_dependency_metrics @@ -58,13 +58,10 @@ def facade_error_handler(request,exc,traceback): def facade_analysis_init_facade_task(repo_git): logger = logging.getLogger(facade_analysis_init_facade_task.__name__) - with FacadeSession(logger) as session: - - repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() - repo_id = repo.repo_id + facade_helper = FacadeHelper(logger) - session.update_status('Running analysis') - session.log_activity('Info',f"Beginning analysis.") + facade_helper.update_status('Running analysis') + facade_helper.log_activity('Info',f"Beginning analysis.") @celery.task(base=AugurFacadeRepoCollectionTask) @@ -72,107 +69,76 @@ def trim_commits_facade_task(repo_git): logger = logging.getLogger(trim_commits_facade_task.__name__) - with FacadeSession(logger) as session: - - repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() - repo_id = repo.repo_id + facade_helper = FacadeHelper(logger) - def update_analysis_log(repos_id,status): + repo = get_repo_by_repo_git(repo_git) - # Log a repo's analysis status - - log_message = s.sql.text("""INSERT INTO analysis_log (repos_id,status) - VALUES (:repo_id,:status)""").bindparams(repo_id=repos_id,status=status) - - try: - session.execute_sql(log_message) - except: - pass + repo_id = repo.repo_id + facade_helper.inc_repos_processed() + facade_helper.update_analysis_log(repo_id,"Beginning analysis.") + # First we check to see if the previous analysis didn't complete - session.inc_repos_processed() - update_analysis_log(repo_id,"Beginning analysis.") - # First we check to see if the previous analysis didn't complete + working_commits = get_working_commits_by_repo_id(repo_id) - get_status = s.sql.text("""SELECT working_commit FROM working_commits WHERE repos_id=:repo_id - """).bindparams(repo_id=repo_id) - - try: - working_commits = session.fetchall_data_from_sql_text(get_status) - except: - working_commits = [] - - # If there's a commit still there, the previous run was interrupted and - # the commit data may be incomplete. It should be trimmed, just in case. - commits_to_trim = [commit['working_commit'] for commit in working_commits] - - trim_commits(session,repo_id,commits_to_trim) - # Start the main analysis + # If there's a commit still there, the previous run was interrupted and + # the commit data may be incomplete. It should be trimmed, just in case. 
+ commits_to_trim = [commit['working_commit'] for commit in working_commits] + + trim_commits(facade_helper,repo_id,commits_to_trim) + # Start the main analysis - update_analysis_log(repo_id,'Collecting data') - logger.info(f"Got past repo {repo_id}") + facade_helper.update_analysis_log(repo_id,'Collecting data') + logger.info(f"Got past repo {repo_id}") @celery.task(base=AugurFacadeRepoCollectionTask) def trim_commits_post_analysis_facade_task(repo_git): logger = logging.getLogger(trim_commits_post_analysis_facade_task.__name__) + facade_helper = FacadeHelper(logger) - with FacadeSession(logger) as session: - repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() - repo_id = repo.repo_id + repo = repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id - start_date = session.get_setting('start_date') - def update_analysis_log(repos_id,status): - - # Log a repo's analysis status - - log_message = s.sql.text("""INSERT INTO analysis_log (repos_id,status) - VALUES (:repo_id,:status)""").bindparams(repo_id=repos_id,status=status) - - - session.execute_sql(log_message) - - session.logger.info(f"Generating sequence for repo {repo_id}") - - query = session.query(Repo).filter(Repo.repo_id == repo_id) - repo = execute_session_query(query, 'one') - - #Get the huge list of commits to process. - absoulte_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) - repo_loc = (f"{absoulte_path}/.git") - # Grab the parents of HEAD - - parent_commits = get_parent_commits_set(repo_loc, start_date) + start_date = facade_helper.get_setting('start_date') + + logger.info(f"Generating sequence for repo {repo_id}") - # Grab the existing commits from the database - existing_commits = get_existing_commits_set(session, repo_id) + repo = get_repo_by_repo_git(repo_git) - # Find missing commits and add them + #Get the huge list of commits to process. 
+ absolute_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) + repo_loc = (f"{absolute_path}/.git") + # Grab the parents of HEAD - missing_commits = parent_commits - existing_commits + parent_commits = get_parent_commits_set(repo_loc, start_date) - session.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") - - # Find commits which are out of the analysis range + # Grab the existing commits from the database + existing_commits = get_existing_commits_set(repo_id) - trimmed_commits = existing_commits - parent_commits + # Find missing commits and add them - update_analysis_log(repo_id,'Data collection complete') + missing_commits = parent_commits - existing_commits - update_analysis_log(repo_id,'Beginning to trim commits') + facade_helper.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") + + # Find commits which are out of the analysis range - session.log_activity('Debug',f"Commits to be trimmed from repo {repo_id}: {len(trimmed_commits)}") + trimmed_commits = existing_commits - parent_commits + facade_helper.update_analysis_log(repo_id,'Data collection complete') + facade_helper.update_analysis_log(repo_id,'Beginning to trim commits') - #for commit in trimmed_commits: - trim_commits(session,repo_id,trimmed_commits) - + facade_helper.log_activity('Debug',f"Commits to be trimmed from repo {repo_id}: {len(trimmed_commits)}") - update_analysis_log(repo_id,'Commit trimming complete') + #for commit in trimmed_commits: + trim_commits(facade_helper,repo_id,trimmed_commits) + + facade_helper.update_analysis_log(repo_id,'Commit trimming complete') - update_analysis_log(repo_id,'Complete') + facade_helper.update_analysis_log(repo_id,'Complete') @@ -180,8 +146,8 @@ def update_analysis_log(repos_id,status): def facade_analysis_end_facade_task(): logger = logging.getLogger(facade_analysis_end_facade_task.__name__) - with FacadeSession(logger) as session: - session.log_activity('Info','Running analysis (complete)') + facade_helper = FacadeHelper(logger) + facade_helper.log_activity('Info','Running analysis (complete)') @@ -189,133 +155,145 @@ def facade_analysis_end_facade_task(): def facade_start_contrib_analysis_task(): logger = logging.getLogger(facade_start_contrib_analysis_task.__name__) - with FacadeSession(logger) as session: - session.update_status('Updating Contributors') - session.log_activity('Info', 'Updating Contributors with commits') + facade_helper = FacadeHelper(logger) + facade_helper.update_status('Updating Contributors') + facade_helper.log_activity('Info', 'Updating Contributors with commits') - -#enable celery multithreading @celery.task(base=AugurFacadeRepoCollectionTask) -def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: - """Take a large list of commit data to analyze and store in the database. Meant to be run in parallel with other instances of this task. - """ +def facade_fetch_missing_commit_messages(repo_git): + logger = logging.getLogger(facade_fetch_missing_commit_messages.__name__) + facade_helper = FacadeHelper(logger) - #create new session for celery thread. 
- logger = logging.getLogger(analyze_commits_in_parallel.__name__) - with FacadeSession(logger) as session: - - repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() - repo_id = repo.repo_id + repo = get_repo_by_repo_git(repo_git) + + logger.debug(f"Fetching missing commit message records for repo {repo_git}") - start_date = session.get_setting('start_date') + missing_message_hashes = get_missing_commit_message_hashes(repo.repo_id) - session.logger.info(f"Generating sequence for repo {repo_id}") - - query = session.query(Repo).filter(Repo.repo_id == repo_id) - repo = execute_session_query(query, 'one') + to_insert = [] + for hash in missing_message_hashes: #Get the huge list of commits to process. - absoulte_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_id, repo.repo_path, repo.repo_name) - repo_loc = (f"{absoulte_path}/.git") - # Grab the parents of HEAD - - parent_commits = get_parent_commits_set(repo_loc, start_date) + logger.debug(f"The hash object is: {hash}. It has a type of: {type(hash)}") - # Grab the existing commits from the database - existing_commits = get_existing_commits_set(session, repo_id) - - # Find missing commits and add them - missing_commits = parent_commits - existing_commits + try: + escaped_hash = hash['cmt_commit_hash'] + except (TypeError, IndexError): + escaped_hash = hash + + absolute_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path, repo.repo_name) + repo_loc = (f"{absolute_path}/.git") + + try: + commit_message = check_output( + f"git --git-dir {repo_loc} log --format=%B -n 1 {escaped_hash}".split() + #f"git --git-dir {repo_loc} log --format=%B -n 1 {hash}".split() + ).decode('utf-8').strip() + + msg_record = { + 'repo_id' : repo.repo_id, + 'cmt_msg' : commit_message, + #'cmt_hash' : hash, + 'cmt_hash': escaped_hash if isinstance(escaped_hash, str) else escaped_hash['cmt_commit_hash'], + 'tool_source' : 'Facade', + 'tool_version' : '0.78?', + 'data_source' : 'git', + 'data_collection_date' : datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + } + + if len(to_insert) >= 1000: + bulk_insert_dicts(logger,to_insert, CommitMessage, ["repo_id","cmt_hash"]) + to_insert = [] + + to_insert.append(msg_record) + except Exception as e: + logger.info(f'The exception is : {e}.') - session.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") + if to_insert: + bulk_insert_dicts(logger, to_insert, CommitMessage, ["repo_id","cmt_hash"]) - - if not len(missing_commits): - #session.log_activity('Info','Type of missing_commits: %s' % type(missing_commits)) - return - - queue = list(missing_commits) - logger.info(f"Got to analysis!") - absoulte_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) - repo_loc = (f"{absoulte_path}/.git") +#enable celery multithreading +@celery.task(base=AugurFacadeRepoCollectionTask) +def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: + """Take a large list of commit data to analyze and store in the database. Meant to be run in parallel with other instances of this task. + """ - pendingCommitRecordsToInsert = [] + #create new session for celery thread. 
+ logger = logging.getLogger(analyze_commits_in_parallel.__name__) + facade_helper = FacadeHelper(logger) - for count, commitTuple in enumerate(queue): - quarterQueue = int(len(queue) / 4) + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id - if quarterQueue == 0: - quarterQueue = 1 # prevent division by zero with integer math + start_date = facade_helper.get_setting('start_date') - #Log progress when another quarter of the queue has been processed - if (count + 1) % quarterQueue == 0: - logger.info(f"Progress through current analysis queue is {(count / len(queue)) * 100}%") + logger.info(f"Generating sequence for repo {repo_id}") + + repo = get_repo_by_repo_id(repo_id) + #Get the huge list of commits to process. + absolute_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path, repo.repo_name) + repo_loc = (f"{absolute_path}/.git") + # Grab the parents of HEAD - #logger.info(f"Got to analysis!") - commitRecords = analyze_commit(session, repo_id, repo_loc, commitTuple) - #logger.debug(commitRecord) - if len(commitRecords): - pendingCommitRecordsToInsert.extend(commitRecords) - if len(pendingCommitRecordsToInsert) >= 1000: - facade_bulk_insert_commits(session,pendingCommitRecordsToInsert) - pendingCommitRecordsToInsert = [] + parent_commits = get_parent_commits_set(repo_loc, start_date) - - facade_bulk_insert_commits(session,pendingCommitRecordsToInsert) + # Grab the existing commits from the database + existing_commits = get_existing_commits_set(repo_id) - + # Find missing commits and add them + missing_commits = parent_commits - existing_commits + facade_helper.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") - # Remove the working commit. - remove_commit = s.sql.text("""DELETE FROM working_commits - WHERE repos_id = :repo_id AND working_commit IN :hashes - """).bindparams(repo_id=repo_id,hashes=tuple(queue)) - session.execute_sql(remove_commit) - logger.info("Analysis complete") - return - -@celery.task -def nuke_affiliations_facade_task(): - - logger = logging.getLogger(nuke_affiliations_facade_task.__name__) + if not len(missing_commits) or repo_id is None: + #session.log_activity('Info','Type of missing_commits: %s' % type(missing_commits)) + return - with FacadeSession(logger) as session: - nuke_affiliations(session) + queue = list(missing_commits) -@celery.task -def fill_empty_affiliations_facade_task(): + logger.info(f"Got to analysis!") + absolute_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) + repo_loc = (f"{absolute_path}/.git") - logger = logging.getLogger(fill_empty_affiliations_facade_task.__name__) - with FacadeSession(logger) as session: - fill_empty_affiliations(session) + pendingCommitRecordsToInsert = [] + pendingCommitMessageRecordsToInsert = [] -@celery.task -def invalidate_caches_facade_task(): + for count, commitTuple in enumerate(queue): + quarterQueue = int(len(queue) / 4) - logger = logging.getLogger(invalidate_caches_facade_task.__name__) + if quarterQueue == 0: + quarterQueue = 1 # prevent division by zero with integer math - with FacadeSession(logger) as session: - invalidate_caches(session) + #Log progress when another quarter of the queue has been processed + if (count + 1) % quarterQueue == 0: + logger.info(f"Progress through current analysis queue is {(count / len(queue)) * 100}%") -@celery.task -def rebuild_unknown_affiliation_and_web_caches_facade_task(): - - logger = 
logging.getLogger(rebuild_unknown_affiliation_and_web_caches_facade_task.__name__) + #logger.info(f"Got to analysis!") + commitRecords, commit_msg = analyze_commit(logger, repo_id, repo_loc, commitTuple) + #logger.debug(commitRecord) + if commitRecords: + pendingCommitRecordsToInsert.extend(commitRecords) + if len(pendingCommitRecordsToInsert) >= 1000: + facade_bulk_insert_commits(logger,pendingCommitRecordsToInsert) + pendingCommitRecordsToInsert = [] + + if commit_msg: + pendingCommitMessageRecordsToInsert.append(commit_msg) + + if len(pendingCommitMessageRecordsToInsert) >= 1000: + bulk_insert_dicts(logger,pendingCommitMessageRecordsToInsert, CommitMessage, ["repo_id","cmt_hash"]) - with FacadeSession(logger) as session: - rebuild_unknown_affiliation_and_web_caches(session) + bulk_insert_dicts(logger,pendingCommitMessageRecordsToInsert, CommitMessage, ["repo_id","cmt_hash"]) + facade_bulk_insert_commits(logger,pendingCommitRecordsToInsert) + # Remove the working commit. + remove_working_commits_by_repo_id_and_hashes(repo_id, queue) -@celery.task -def git_repo_cleanup_facade_task(repo_git): - - logger = logging.getLogger(git_repo_cleanup_facade_task.__name__) - - with FacadeSession(logger) as session: - git_repo_cleanup(session, repo_git) + logger.info("Analysis complete") + return # retry this task indefinitely every 5 minutes if it errors. Since the only way it gets scheduled is by itself, so if it stops running no more clones will happen till the instance is restarted @celery.task(autoretry_for=(Exception,), retry_backoff=True, retry_backoff_max=300, retry_jitter=True, max_retries=None) @@ -325,27 +303,29 @@ def clone_repos(): is_pending = CollectionStatus.facade_status == CollectionState.PENDING.value - with FacadeSession(logger) as session: + facade_helper = FacadeHelper(logger) + + with get_session() as session: # process up to 1000 repos at a time repo_git_identifiers = get_collection_status_repo_git_from_filter(session, is_pending, 999999) for repo_git in repo_git_identifiers: # set repo to intializing - repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() + repo = Repo.get_by_repo_git(session, repo_git) repoStatus = repo.collection_status[0] setattr(repoStatus,"facade_status", CollectionState.INITIALIZING.value) session.commit() # clone repo try: - git_repo_initialize(session, repo_git) + git_repo_initialize(facade_helper, session, repo_git) session.commit() # get the commit count - commit_count = get_repo_commit_count(session, repo_git) - facade_weight = get_facade_weight_with_commit_count(session, repo_git, commit_count) + commit_count = get_repo_commit_count(logger, facade_helper, repo_git) + facade_weight = get_facade_weight_with_commit_count(repo_git, commit_count) - update_facade_scheduling_fields(session, repo_git, facade_weight, commit_count) + update_facade_scheduling_fields(repo_git, facade_weight, commit_count) # set repo to update setattr(repoStatus,"facade_status", CollectionState.UPDATE.value) @@ -361,9 +341,7 @@ def clone_repos(): setattr(repoStatus,"facade_status", CollectionState.ERROR.value) session.commit() - clone_repos.si().apply_async(countdown=60*5) - - + clone_repos.si().apply_async(countdown=60*5) #@celery.task(bind=True) @@ -373,7 +351,7 @@ def clone_repos(): # # logger = logging.getLogger(check_for_repo_updates_facade_task.__name__) # -# with FacadeSession(logger) as session: +# facade_helper = FacadeHelper(logger) # check_for_repo_updates(session, repo_git) @celery.task(base=AugurFacadeRepoCollectionTask, bind=True) @@ -383,11 +361,12 @@ 
def git_update_commit_count_weight(self, repo_git): logger = logging.getLogger(git_update_commit_count_weight.__name__) # Change facade session to take in engine - with FacadeSession(logger) as session: - commit_count = get_repo_commit_count(session, repo_git) - facade_weight = get_facade_weight_with_commit_count(session, repo_git, commit_count) + facade_helper = FacadeHelper(logger) - update_facade_scheduling_fields(session, repo_git, facade_weight, commit_count) + commit_count = get_repo_commit_count(logger, facade_helper, repo_git) + facade_weight = get_facade_weight_with_commit_count(repo_git, commit_count) + + update_facade_scheduling_fields(repo_git, facade_weight, commit_count) @celery.task(base=AugurFacadeRepoCollectionTask) @@ -395,11 +374,12 @@ def git_repo_updates_facade_task(repo_git): logger = logging.getLogger(git_repo_updates_facade_task.__name__) - with FacadeSession(logger) as session: - git_repo_updates(session, repo_git) + facade_helper = FacadeHelper(logger) + + git_repo_updates(facade_helper, repo_git) -def generate_analysis_sequence(logger,repo_git, session): +def generate_analysis_sequence(logger,repo_git, facade_helper): """Run the analysis by looping over all active repos. For each repo, we retrieve the list of commits which lead to HEAD. If any are missing from the database, they are filled in. Then we check to see if any commits in the database are @@ -410,19 +390,16 @@ def generate_analysis_sequence(logger,repo_git, session): commit being analyzed at the time) we can recover. """ - - analysis_sequence = [] - repo_list = s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo - WHERE repo_git=:value""").bindparams(value=repo_git) - repos = session.fetchall_data_from_sql_text(repo_list) + #repo_list = s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo WHERE repo_git=:value""").bindparams(value=repo_git) + #repos = fetchall_data_from_sql_text(repo_list) - start_date = session.get_setting('start_date') + start_date = facade_helper.get_setting('start_date') - repo_ids = [repo['repo_id'] for repo in repos] + #repo_ids = [repo['repo_id'] for repo in repos] - repo_id = repo_ids.pop(0) + #repo_id = repo_ids.pop(0) analysis_sequence.append(facade_analysis_init_facade_task.si(repo_git)) @@ -432,6 +409,7 @@ def generate_analysis_sequence(logger,repo_git, session): analysis_sequence.append(trim_commits_post_analysis_facade_task.si(repo_git)) + analysis_sequence.append(facade_fetch_missing_commit_messages.si(repo_git)) analysis_sequence.append(facade_analysis_end_facade_task.si()) @@ -440,135 +418,59 @@ def generate_analysis_sequence(logger,repo_git, session): -def generate_contributor_sequence(logger,repo_git, session): - - contributor_sequence = [] - #all_repo_ids = [] - repo_id = None - - #contributor_sequence.append(facade_start_contrib_analysis_task.si()) - query = s.sql.text("""SELECT repo_id FROM repo - WHERE repo_git=:value""").bindparams(value=repo_git) - - repo = session.execute_sql(query).fetchone() - session.logger.info(f"repo: {repo}") - repo_id = repo[0] - #pdb.set_trace() - #breakpoint() - #for repo in all_repos: - # contributor_sequence.append(insert_facade_contributors.si(repo['repo_id'])) - #all_repo_ids = [repo['repo_id'] for repo in all_repos] - - #contrib_group = create_grouped_task_load(dataList=all_repo_ids,task=insert_facade_contributors)#group(contributor_sequence) - #contrib_group.link_error(facade_error_handler.s()) - #return contrib_group#chain(facade_start_contrib_analysis_task.si(), contrib_group) - return 
insert_facade_contributors.si(repo_id) - - -def facade_phase(repo_git): + +def facade_phase(repo_git, full_collection): logger = logging.getLogger(facade_phase.__name__) logger.info("Generating facade sequence") - with FacadeSession(logger) as session: - #Get the repo_id - repo_list = s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo - WHERE repo_git=:value""").bindparams(value=repo_git) - repos = session.fetchall_data_from_sql_text(repo_list) + facade_helper = FacadeHelper(logger) + #Get the repo_id + #repo_list = s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo WHERE repo_git=:value""").bindparams(value=repo_git) + #repos = fetchall_data_from_sql_text(repo_list) - start_date = session.get_setting('start_date') + start_date = facade_helper.get_setting('start_date') - repo_ids = [repo['repo_id'] for repo in repos] + #repo_ids = [repo['repo_id'] for repo in repos] - repo_id = repo_ids.pop(0) + #repo_id = repo_ids.pop(0) - #Get the collectionStatus - query = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo_id) + #Get the collectionStatus + #query = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo_id) - status = execute_session_query(query,'one') - - # Figure out what we need to do - limited_run = session.limited_run - run_analysis = session.run_analysis - pull_repos = session.pull_repos - #force_analysis = session.force_analysis - run_facade_contributors = session.run_facade_contributors - - facade_sequence = [] - facade_core_collection = [] - - if not limited_run or (limited_run and pull_repos): - facade_core_collection.append(git_repo_updates_facade_task.si(repo_git)) - - facade_core_collection.append(git_update_commit_count_weight.si(repo_git)) + #status = execute_session_query(query,'one') + + # Figure out what we need to do + limited_run = facade_helper.limited_run + run_analysis = facade_helper.run_analysis + pull_repos = facade_helper.pull_repos + #force_analysis = session.force_analysis + run_facade_contributors = facade_helper.run_facade_contributors + + facade_sequence = [] + facade_core_collection = [] + + if not limited_run or (limited_run and pull_repos): + facade_core_collection.append(git_repo_updates_facade_task.si(repo_git)) + + facade_core_collection.append(git_update_commit_count_weight.si(repo_git)) - #Generate commit analysis task order. - if not limited_run or (limited_run and run_analysis): - facade_core_collection.extend(generate_analysis_sequence(logger,repo_git,session)) + #Generate commit analysis task order. + if not limited_run or (limited_run and run_analysis): + facade_core_collection.extend(generate_analysis_sequence(logger,repo_git,facade_helper)) - #Generate contributor analysis task group. - if not limited_run or (limited_run and run_facade_contributors): - facade_core_collection.append(generate_contributor_sequence(logger,repo_git,session)) + #Generate contributor analysis task group. + if not limited_run or (limited_run and run_facade_contributors): + facade_core_collection.append(insert_facade_contributors.si(repo_git)) - #These tasks need repos to be cloned by facade before they can work. - facade_sequence.append( - group( - chain(*facade_core_collection), - process_dependency_metrics.si(repo_git), - process_libyear_dependency_metrics.si(repo_git), - process_scc_value_metrics.si(repo_git) - ) + #These tasks need repos to be cloned by facade before they can work. 
+ facade_sequence.append( + group( + chain(*facade_core_collection), + process_dependency_metrics.si(repo_git), + process_libyear_dependency_metrics.si(repo_git), + process_scc_value_metrics.si(repo_git) ) + ) - logger.info(f"Facade sequence: {facade_sequence}") - return chain(*facade_sequence) - -def generate_non_repo_domain_facade_tasks(logger): - logger.info("Generating facade sequence") - with FacadeSession(logger) as session: - - # Figure out what we need to do - limited_run = session.limited_run - delete_marked_repos = session.delete_marked_repos - pull_repos = session.pull_repos - # clone_repos = session.clone_repos - check_updates = session.check_updates - # force_updates = session.force_updates - run_analysis = session.run_analysis - # force_analysis = session.force_analysis - nuke_stored_affiliations = session.nuke_stored_affiliations - fix_affiliations = session.fix_affiliations - force_invalidate_caches = session.force_invalidate_caches - rebuild_caches = session.rebuild_caches - #if abs((datetime.datetime.strptime(session.cfg.get_setting('aliases_processed')[:-3], - # '%Y-%m-%d %I:%M:%S.%f') - datetime.datetime.now()).total_seconds()) // 3600 > int(session.cfg.get_setting( - # 'update_frequency')) else 0 - force_invalidate_caches = session.force_invalidate_caches - create_xlsx_summary_files = session.create_xlsx_summary_files - multithreaded = session.multithreaded - - facade_sequence = [] - - if nuke_stored_affiliations: - #facade_sequence.append(nuke_affiliations_facade_task.si().on_error(facade_error_handler.s()))#nuke_affiliations(session.cfg) - logger.info("Nuke stored affiliations is deprecated.") - # deprecated because the UI component of facade where affiliations would be - # nuked upon change no longer exists, and this information can easily be derived - # from queries and materialized views in the current version of Augur. - # This method is also a major performance bottleneck with little value. - - #session.logger.info(session.cfg) - if not limited_run or (limited_run and fix_affiliations): - #facade_sequence.append(fill_empty_affiliations_facade_task.si().on_error(facade_error_handler.s()))#fill_empty_affiliations(session) - logger.info("Fill empty affiliations is deprecated.") - # deprecated because the UI component of facade where affiliations would need - # to be fixed upon change no longer exists, and this information can easily be derived - # from queries and materialized views in the current version of Augur. - # This method is also a major performance bottleneck with little value. 
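The facade_sequence built above leans on Celery canvas primitives: immutable signatures (`.si()`) keep arguments fixed, `chain` enforces the ordered core collection, and `group` lets the dependency, libyear, and scc tasks run alongside it once the repo is cloned. A minimal sketch of that shape with placeholder task names, not the real Augur tasks:

from celery import Celery, chain, group

app = Celery('facade_sketch')

@app.task
def ordered_step(repo_git):
    # Stand-in for git_repo_updates_facade_task, the analysis sequence, etc.
    return repo_git

@app.task
def metrics_step(repo_git):
    # Stand-in for process_dependency_metrics / process_scc_value_metrics.
    return repo_git

def build_phase(repo_git):
    core = [ordered_step.si(repo_git), ordered_step.si(repo_git)]
    # Ordered work runs as one chain; the metrics task only needs the clone
    # to exist on disk, so it sits next to the chain inside a group rather
    # than being appended to it.
    return group(
        chain(*core),
        metrics_step.si(repo_git),
    )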
- - if force_invalidate_caches: - facade_sequence.append(invalidate_caches_facade_task.si().on_error(facade_error_handler.s()))#invalidate_caches(session.cfg) - - if not limited_run or (limited_run and rebuild_caches): - facade_sequence.append(rebuild_unknown_affiliation_and_web_caches_facade_task.si().on_error(facade_error_handler.s()))#rebuild_unknown_affiliation_and_web_caches(session.cfg) - - return facade_sequence + logger.info(f"Facade sequence: {facade_sequence}") + return chain(*facade_sequence) \ No newline at end of file diff --git a/augur/tasks/git/scc_value_tasks/core.py b/augur/tasks/git/scc_value_tasks/core.py index 71993ebcd1..65ff4cb128 100644 --- a/augur/tasks/git/scc_value_tasks/core.py +++ b/augur/tasks/git/scc_value_tasks/core.py @@ -1,24 +1,31 @@ from datetime import datetime import os from augur.application.db.models import * +from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_value from augur.tasks.util.worker_util import parse_json_from_subprocess_call +from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path -def value_model(session,repo_git,repo_id, path): +def value_model(logger,repo_git): """Runs scc on repo and stores data in database :param repo_id: Repository ID - :param path: absolute file path of the Repostiory """ + logger.info(f"repo_git: {repo_git}") - session.logger.info('Generating value data for repo') - session.logger.info(f"Repo ID: {repo_id}, Path: {path}") - session.logger.info('Running scc...') + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id - path_to_scc = os.environ['HOME'] + '/scc' + path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo_id,repo.repo_path,repo.repo_name) - required_output = parse_json_from_subprocess_call(session.logger,['./scc', '-f','json','--by-file', path], cwd=path_to_scc) + logger.info('Generating value data for repo') + logger.info(f"Repo ID: {repo_id}, Path: {path}") + logger.info('Running scc...') + + path_to_scc = os.getenv('SCC_DIR', os.environ['HOME'] + '/scc') + + required_output = parse_json_from_subprocess_call(logger,['./scc', '-f','json','--by-file', path], cwd=path_to_scc) - session.logger.info('adding scc data to database... ') - session.logger.debug(f"output: {required_output}") + logger.info('adding scc data to database... 
') + logger.debug(f"output: {required_output}") to_insert = [] for record in required_output: @@ -42,6 +49,6 @@ def value_model(session,repo_git,repo_id, path): to_insert.append(repo_labor) - session.insert_data(to_insert, RepoLabor, ["repo_id", "rl_analysis_date", "file_path", "file_name" ]) + bulk_insert_dicts(logger, to_insert, RepoLabor, ["repo_id", "rl_analysis_date", "file_path", "file_name" ]) - session.logger.info(f"Done generating scc data for repo {repo_id} from path {path}") + logger.info(f"Done generating scc data for repo {repo_id} from path {path}") diff --git a/augur/tasks/git/scc_value_tasks/tasks.py b/augur/tasks/git/scc_value_tasks/tasks.py index 37ff4ac4b1..dc0cd94724 100644 --- a/augur/tasks/git/scc_value_tasks/tasks.py +++ b/augur/tasks/git/scc_value_tasks/tasks.py @@ -1,26 +1,13 @@ import logging -from augur.application.db.session import DatabaseSession +from augur.application.db.lib import get_session from augur.tasks.git.scc_value_tasks.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask -from augur.application.db.util import execute_session_query -from augur.application.db.lib import get_value -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path -@celery.task(base=AugurFacadeRepoCollectionTask, bind=True) -def process_scc_value_metrics(self, repo_git): - - engine = self.app.engine +@celery.task(base=AugurFacadeRepoCollectionTask) +def process_scc_value_metrics(repo_git): logger = logging.getLogger(process_scc_value_metrics.__name__) - with DatabaseSession(logger,engine) as session: - logger.info(f"repo_git: {repo_git}") - - query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query, 'one') - - absolute_repo_path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo.repo_id,repo.repo_path,repo.repo_name) - - value_model(session,repo_git,repo.repo_id, absolute_repo_path) \ No newline at end of file + value_model(logger,repo_git,) \ No newline at end of file diff --git a/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py b/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py index a0ca29701a..5f8bcd5772 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py @@ -25,11 +25,16 @@ # and checks for any parents of HEAD that aren't already accounted for in the # repos. It also rebuilds analysis data, checks any changed affiliations and # aliases, and caches data for display. +import datetime import subprocess +from subprocess import check_output import os import sqlalchemy as s -def analyze_commit(session, repo_id, repo_loc, commit): +from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text +from augur.tasks.init import get_rabbitmq_conn_string + +def analyze_commit(logger, repo_id, repo_loc, commit): # This function analyzes a given commit, counting the additions, removals, and # whitespace changes. 
It collects all of the metadata about the commit, and @@ -60,7 +65,7 @@ def check_swapped_emails(name,email): # Sometimes people mix up their name and email in their git settings if name.find('@') >= 0 and email.find('@') == -1: - session.logger.debug(f"Found swapped email/name: {email}/{name}") + logger.debug(f"Found swapped email/name: {email}/{name}") return email,name else: return name,email @@ -71,7 +76,7 @@ def strip_extra_amp(email): # matching. This extra info is not used, so we discard it. if email.count('@') > 1: - session.logger.debug(f"Found extra @: {email}") + logger.debug(f"Found extra @: {email}") return email[:email.find('@',email.find('@')+1)] else: return email @@ -84,7 +89,7 @@ def discover_alias(email): WHERE alias_email=:alias_email AND cntrb_active = 1""").bindparams(alias_email=email) - canonical = session.fetchall_data_from_sql_text(fetch_canonical)#list(cursor_people_local) + canonical = fetchall_data_from_sql_text(fetch_canonical)#list(cursor_people_local) if canonical: for email in canonical: @@ -111,7 +116,7 @@ def generate_commit_record(repos_id,commit,filename, #2021-10-11 11:57:46 -0500 placeholder_date = "1970-01-01 00:00:15 -0500" - #session.logger.info(f"Timestamp: {author_timestamp}") + #logger.info(f"Timestamp: {author_timestamp}") commit_record = { 'repo_id' : repos_id, 'cmt_commit_hash' : str(commit), @@ -173,7 +178,26 @@ def generate_commit_record(repos_id,commit,filename, #cursor_local.execute(store_working_commit, (repo_id,commit)) #db_local.commit() - session.execute_sql(store_working_commit) + execute_sql(store_working_commit) + + # commit_message = check_output( + # f"git --git-dir {repo_loc} log --format=%B -n 1 {commit}".split() + # ).strip() + + commit_message = check_output( + f"git --git-dir {repo_loc} log --format=%B -n 1 {commit}".split() + ).decode('utf-8').strip() + + msg_record = { + 'repo_id' : repo_id, + 'cmt_msg' : commit_message, + 'cmt_hash' : commit, + 'tool_source' : 'Facade', + 'tool_version' : '0.78?', + 'data_source' : 'git', + 'data_collection_date' : datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + } + #session.log_activity('Debug',f"Stored working commit and analyzing : {commit}") @@ -312,4 +336,4 @@ def generate_commit_record(repos_id,commit,filename, added,removed,whitespace)) - return recordsToInsert + return recordsToInsert, msg_record diff --git a/augur/tasks/git/util/facade_worker/facade_worker/config.py b/augur/tasks/git/util/facade_worker/facade_worker/config.py index 19539d79de..c62034a94e 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/config.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/config.py @@ -34,8 +34,9 @@ from sqlalchemy.exc import OperationalError from psycopg2.errors import DeadlockDetected -from augur.tasks.github.util.github_task_session import * -from augur.application.config import AugurConfig +from augur.application.db.session import DatabaseSession +from augur.application.db.lib import execute_sql +from augur.application.db.lib import get_section from logging import Logger logger = logging.getLogger(__name__) @@ -77,7 +78,7 @@ def get_database_args_from_env(): #print(credentials) return credentials -class FacadeSession(GithubTaskSession): +class FacadeHelper(): """ORM session used in facade tasks. This class adds the various attributes needed for legacy facade as well as a modified version of the legacy FacadeConfig class. 
@@ -104,12 +105,12 @@ def __init__(self,logger: Logger): from augur.application.db import get_engine engine = get_engine() - #self.cfg = FacadeConfig(logger) self.repos_processed = 0 - super().__init__(logger=logger, engine=engine) - # Figure out what we need to do + # super().__init__(logger=logger, engine=engine) + + self.logger = logger - worker_options = AugurConfig(logger, self).get_section("Facade") + worker_options = get_section("Facade") self.limited_run = worker_options["limited_run"] self.delete_marked_repos = worker_options["delete_marked_repos"] @@ -150,7 +151,7 @@ def get_setting(self,setting): query = s.sql.text("""SELECT value FROM settings WHERE setting=:settingParam ORDER BY last_modified DESC LIMIT 1""").bindparams(settingParam=setting) - result = self.execute_sql(query).fetchone() + result = execute_sql(query).fetchone() print(result) return result[0] @@ -159,7 +160,7 @@ def update_status(self, status): query = s.sql.text("""UPDATE settings SET value=:statusParam WHERE setting='utility_status' """).bindparams(statusParam=status) - self.execute_sql(query) + execute_sql(query) def log_activity(self, level, status): # Log an activity based upon urgency and user's preference. If the log level is @@ -176,7 +177,7 @@ def log_activity(self, level, status): """).bindparams(levelParam=level,statusParam=status) try: - self.execute_sql(query) + execute_sql(query) except Exception as e: self.logger.error(f"Error encountered: {e}") raise e @@ -187,9 +188,19 @@ def update_repo_log(self,repos_id,status): VALUES (:repo_id,:repo_status)""").bindparams(repo_id=repos_id,repo_status=status) try: - self.execute_sql(log_message) + execute_sql(log_message) except: pass + + def update_analysis_log(self, repos_id,status): + + # Log a repo's analysis status + + log_message = s.sql.text("""INSERT INTO analysis_log (repos_id,status) + VALUES (:repo_id,:status)""").bindparams(repo_id=repos_id,status=status) + + execute_sql(log_message) + def insert_or_update_data(self, query, **bind_args)-> None: """Provide deadlock detection for postgres updates, inserts, and deletions for facade. @@ -206,9 +217,9 @@ def insert_or_update_data(self, query, **bind_args)-> None: try: if bind_args: #self.cfg.cursor.execute(query, params) - self.execute_sql(query.bindparams(**bind_args)) + execute_sql(query.bindparams(**bind_args)) else: - self.execute_sql(query) + execute_sql(query) break except OperationalError as e: # print(str(e).split("Process")[1].split(";")[0]) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py b/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py index b41c6f14da..1811c734f6 100755 --- a/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py @@ -27,7 +27,7 @@ # aliases, and caches data for display. 
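FacadeHelper.insert_or_update_data keeps the old deadlock handling but now routes statements through the module-level execute_sql. The hunk only shows the first attempt and the OperationalError catch; a sketch of the overall retry shape, where the attempt cap and sleep interval are assumptions rather than values taken from this diff:

import time
from sqlalchemy.exc import OperationalError
from psycopg2.errors import DeadlockDetected

from augur.application.db.lib import execute_sql

def run_with_deadlock_retry(query, max_attempts=5, wait_seconds=5, **bind_args):
    """Re-run a statement that Postgres aborted in order to break a deadlock."""
    attempts = 0
    while attempts < max_attempts:
        try:
            if bind_args:
                execute_sql(query.bindparams(**bind_args))
            else:
                execute_sql(query)
            return
        except OperationalError as e:
            # Only deadlocks are retried; anything else propagates.
            if not isinstance(e.orig, DeadlockDetected):
                raise
            attempts += 1
            time.sleep(wait_seconds)
    raise RuntimeError("Statement kept deadlocking; giving up")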
from __future__ import annotations import html.parser -from .config import FacadeSession as FacadeSession +from .config import FacadeHelper as FacadeHelper #.facade06analyze analysis moved to facade_tasks.py - IM 10/12/22 #from contributor_interfaceable.facade08contributorinterfaceable import ContributorInterfaceable diff --git a/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py b/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py deleted file mode 100644 index 3ec2013274..0000000000 --- a/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py +++ /dev/null @@ -1,183 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2016-2018 Brian Warner -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# SPDX-License-Identifier: Apache-2.0 - -# Git repo maintenance -# -# This script is responsible for cloning new repos and keeping existing repos up -# to date. It can be run as often as you want (and will detect when it's -# already running, so as not to spawn parallel processes), but once or twice per -# day should be more than sufficient. Each time it runs, it updates the repo -# and checks for any parents of HEAD that aren't already accounted for in the -# repos. It also rebuilds analysis data, checks any changed affiliations and -# aliases, and caches data for display. -import subprocess -import sqlalchemy as s -from augur.application.db.util import execute_session_query -from .utilitymethods import get_absolute_repo_path -from augur.application.db.models import * - -#Will delete repos passed and cleanup associated commit data. 
-def git_repo_cleanup(session,repo_git): - -# Clean up any git repos that are pending deletion - - session.update_status('Purging deleted repos') - #session.logger.info("Processing deletions") - session.log_activity('Info','Processing deletions') - - - query = session.query(Repo).filter( - Repo.repo_git == repo_git)#s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo WHERE repo_status='Delete'""") - - delete_repos = execute_session_query(query,'all')#session.fetchall_data_from_sql_text(query) - - for row in delete_repos: - - # Remove the files on disk - - absolute_path = get_absolute_repo_path(session.repo_base_directory, row.repo_id, row.repo_path,row.repo_name) - - cmd = ("rm -rf %s" - % (absolute_path)) - - return_code = subprocess.Popen([cmd],shell=True).wait() - - # Remove the analysis data - - remove_commits = s.sql.text("""DELETE FROM commits WHERE repo_id=:repo_id - """).bindparams(repo_id=row.repo_id) - session.execute_sql(remove_commits) - - optimize_table = s.sql.text("""OPTIMIZE TABLE commits""") - session.execute_sql(optimize_table) - - # Remove cached repo data - - remove_dm_repo_weekly = s.sql.text("""DELETE FROM dm_repo_weekly WHERE repo_id=:repo_id - """).bindparams(repo_id=row.repo_id) - session.execute_sql(remove_dm_repo_weekly) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_weekly""") - session.execute_sql(optimize_table) - - remove_dm_repo_monthly = s.sql.text("""DELETE FROM dm_repo_monthly WHERE repo_id=:repo_id - """).bindparams(repo_id=row.repo_id) - session.execute_sql(remove_dm_repo_monthly) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_monthly""") - session.execute_sql(optimize_table) - - remove_dm_repo_annual = s.sql.text("""DELETE FROM dm_repo_annual WHERE repo_id=:repo_id - """).bindparams(repo_id=row.repo_id) - session.execute_sql(remove_dm_repo_annual) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_annual""") - session.execute_sql(optimize_table) - - # Set project to be recached if just removing a repo - - set_project_recache = s.sql.text("""UPDATE projects SET recache=TRUE - WHERE id=:repo_group_id""").bindparams(repo_group_id=row.repo_group_id) - session.execute_sql(set_project_recache) - # Remove the entry from the repos table - - query = s.sql.text("""DELETE FROM repo WHERE repo_id=:repo_id - """).bindparams(repo_id=row.repo_id) - session.execute_sql(query) - - #log_activity('Verbose','Deleted repo %s' % row[0]) - #session.logger.debug(f"Deleted repo {row.repo_id}") - session.log_activity('Verbose',f"Deleted repo {row.repo_id}") - cleanup = '%s/%s%s' % (row.repo_group_id,row.repo_path,row.repo_name) - - # Remove any working commits - - remove_working_commits = s.sql.text("""DELETE FROM working_commits WHERE repos_id=:repo_id - """).bindparams(repo_id=row.repo_id) - session.execute_sql(remove_working_commits) - - # Remove the repo from the logs - - remove_logs = s.sql.text("""DELETE FROM repos_fetch_log WHERE repos_id =:repo_id - """).bindparams(repo_id=row.repo_id) - - session.execute_sql(remove_logs) - - optimize_table = s.sql.text("""OPTIMIZE TABLE repos_fetch_log""") - session.execute_sql(optimize_table) - - # Attempt to cleanup any empty parent directories - - while (cleanup.find('/',0) > 0): - cleanup = cleanup[:cleanup.rfind('/',0)] - - cmd = "rmdir %s%s" % (session.repo_base_directory,cleanup) - subprocess.Popen([cmd],shell=True).wait() - #log_activity('Verbose','Attempted %s' % cmd) - #session.logger.debug(f"Attempted {cmd}") - session.log_activity('Verbose',f"Attempted {cmd}") - - 
#update_repo_log(row[0],'Deleted') - session.update_repo_log(row.repo_id,'Deleted') - - # Clean up deleted projects - - get_deleted_projects = s.sql.text("""SELECT repo_group_id FROM repo_groups WHERE rg_name='(Queued for removal)'""") - - deleted_projects = session.fetchall_data_from_sql_text(get_deleted_projects) - - for project in deleted_projects: - - # Remove cached data for projects which were marked for deletion - - clear_annual_cache = s.sql.text("""DELETE FROM dm_repo_group_annual WHERE - repo_group_id=:repo_group_id""").bindparams(repo_group_id=project['repo_group_id']) - session.execute_sql(clear_annual_cache) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_group_annual""") - session.execute_sql(optimize_table) - - clear_monthly_cache = s.sql.text("""DELETE FROM dm_repo_group_monthly WHERE - repo_group_id=:repo_group_id""").bindparams(repo_group_id=project['repo_group_id']) - session.execute_sql(clear_monthly_cache) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_group_monthly""") - session.execute_sql(optimize_table) - - clear_weekly_cache = s.sql.text("""DELETE FROM dm_repo_group_weekly WHERE - repo_group_id=:repo_group_id""").bindparams(repo_group_id=project['repo_group_id']) - session.execute_sql(clear_weekly_cache) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_group_weekly""") - session.execute_sql(optimize_table) - - clear_unknown_cache = s.sql.text("""DELETE FROM unknown_cache WHERE - projects_id=:repo_group_id""").bindparams(repo_group_id=project['repo_group_id']) - session.execute_sql(clear_unknown_cache) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_group_weekly""") - session.execute_sql(optimize_table) - - # Remove any projects which were also marked for deletion - - remove_project = s.sql.text("""DELETE FROM repo_groups WHERE repo_group_id=:repo_group_id - """).bindparams(repo_group_id=project['repo_group_id']) - session.execute_sql(remove_project) - - - session.log_activity('Info', 'Processing deletions (complete)') diff --git a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py index 5668739767..d92f17b692 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py @@ -26,13 +26,14 @@ # repos. It also rebuilds analysis data, checks any changed affiliations and # aliases, and caches data for display. import sqlalchemy as s +from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text from .utilitymethods import store_working_author, trim_author # if platform.python_implementation() == 'PyPy': # import pymysql # else: # import MySQLdb -def nuke_affiliations(session): +def nuke_affiliations(facade_helper): # Delete all stored affiliations in the database. Normally when you # add/remove/change affiliation data via the web UI, any potentially affected @@ -42,16 +43,16 @@ def nuke_affiliations(session): # this is the scorched earth way: remove them all to force a total rebuild. # Brutal but effective. 
- session.log_activity('Info','Nuking affiliations') + facade_helper.log_activity('Info','Nuking affiliations') nuke = s.sql.text("""UPDATE commits SET cmt_author_affiliation = NULL, cmt_committer_affiliation = NULL""") - session.execute_sql(nuke) + execute_sql(nuke) - session.log_activity('Info','Nuking affiliations (complete)') + facade_helper.log_activity('Info','Nuking affiliations (complete)') -def fill_empty_affiliations(session): +def fill_empty_affiliations(facade_helper): @@ -79,13 +80,13 @@ def discover_null_affiliations(attribution,email): - matches = session.fetchall_data_from_sql_text(find_exact_match)#list(cfg.cursor) + matches = fetchall_data_from_sql_text(find_exact_match)#list(cfg.cursor) if not matches and email.find('@') < 0: # It's not a properly formatted email, leave it NULL and log it. - session.log_activity('Info',f"Unmatchable email: {email}") + facade_helper.log_activity('Info',f"Unmatchable email: {email}") return @@ -104,7 +105,7 @@ def discover_null_affiliations(attribution,email): - matches = session.fetchall_data_from_sql_text(find_exact_domain) + matches = fetchall_data_from_sql_text(find_exact_domain) if not matches: @@ -117,7 +118,7 @@ def discover_null_affiliations(attribution,email): ORDER BY ca_start_date DESC""").bindparams(strippedDomain=domain[domain.rfind('.',0,domain.rfind('.',0))+1:]) - matches = session.fetchall_data_from_sql_text(find_domain)#list(cfg.cursor) + matches = fetchall_data_from_sql_text(find_domain)#list(cfg.cursor) if not matches: @@ -130,7 +131,7 @@ def discover_null_affiliations(attribution,email): if matches: - session.log_activity('Debug',f"Found domain match for {email}") + facade_helper.log_activity('Debug',f"Found domain match for {email}") for match in matches: update = s.sql.text(("UPDATE commits " @@ -140,14 +141,14 @@ def discover_null_affiliations(attribution,email): f"AND cmt_{attribution}_date::date >= \'{match['ca_start_date']}\'::date") ).bindparams(affiliation=match['ca_affiliation'],email=email) - session.log_activity('Info', f"attr: {attribution} \nmatch:{match}\nsql: {update}") + facade_helper.log_activity('Info', f"attr: {attribution} \nmatch:{match}\nsql: {update}") try: - session.execute_sql(update) + execute_sql(update) except Exception as e: - session.log_activity('Info', f"Error encountered: {e}") - session.log_activity('Info', f"Affiliation insertion failed for {email} ") - session.log_activity('Info', f"Offending query: {update} ") + facade_helper.log_activity('Info', f"Error encountered: {e}") + facade_helper.log_activity('Info', f"Affiliation insertion failed for {email} ") + facade_helper.log_activity('Info', f"Offending query: {update} ") def discover_alias(email): @@ -158,7 +159,7 @@ def discover_alias(email): WHERE alias_email=:email AND cntrb_active = 1""").bindparams(email=email) - canonical = session.fetchall_data_from_sql_text(fetch_canonical)#list(cfg.cursor) + canonical = fetchall_data_from_sql_text(fetch_canonical)#list(cfg.cursor) if canonical: for email in canonical: @@ -168,8 +169,8 @@ def discover_alias(email): ### The real function starts here ### - session.update_status('Filling empty affiliations') - session.log_activity('Info','Filling empty affiliations') + facade_helper.update_status('Filling empty affiliations') + facade_helper.log_activity('Info','Filling empty affiliations') # Process any changes to the affiliations or aliases, and set any existing # entries in commits to NULL so they are filled properly. 
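The stripped-domain fallback above packs a lot into one slice: it drops everything before the second-to-last dot so that, for example, a commit from mail.example.com can still match an affiliation registered for example.com. A tiny self-contained check of that expression (the helper name is illustrative):

def strip_to_registrable_domain(domain: str) -> str:
    # Same slice as the strippedDomain bind parameter above:
    # keep only the text after the second-to-last dot.
    return domain[domain.rfind('.', 0, domain.rfind('.', 0)) + 1:]

assert strip_to_registrable_domain("mail.example.com") == "example.com"
assert strip_to_registrable_domain("example.com") == "example.com"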
@@ -178,41 +179,41 @@ def discover_alias(email): timefetch = s.sql.text("""SELECT current_timestamp(6) as fetched""") - affiliations_fetched = session.execute_sql(timefetch).fetchone()[0] + affiliations_fetched = execute_sql(timefetch).fetchone()[0] print(affiliations_fetched) # Now find the last time we worked on affiliations, to figure out what's new - affiliations_processed = session.get_setting('affiliations_processed') + affiliations_processed = facade_helper.get_setting('affiliations_processed') get_changed_affiliations = s.sql.text("""SELECT ca_domain FROM contributor_affiliations""")# WHERE " #"ca_last_used >= timestamptz %s") - changed_affiliations = session.fetchall_data_from_sql_text(get_changed_affiliations)#list(cfg.cursor) + changed_affiliations = fetchall_data_from_sql_text(get_changed_affiliations)#list(cfg.cursor) # Process any affiliations which changed since we last checked for changed_affiliation in changed_affiliations: - session.log_activity('Debug',f"Resetting affiliation for {changed_affiliation['ca_domain']}") + facade_helper.log_activity('Debug',f"Resetting affiliation for {changed_affiliation['ca_domain']}") set_author_to_null = s.sql.text("""UPDATE commits SET cmt_author_affiliation = NULL WHERE cmt_author_email LIKE CONCAT('%%',:affiliation)""").bindparams(affiliation=changed_affiliation['ca_domain']) - session.execute_sql(set_author_to_null) + execute_sql(set_author_to_null) set_committer_to_null = s.sql.text("""UPDATE commits SET cmt_committer_affiliation = NULL WHERE cmt_committer_email LIKE CONCAT('%%',:affiliation)""").bindparams(affiliation=changed_affiliation['ca_domain']) - session.execute_sql(set_committer_to_null) + execute_sql(set_committer_to_null) # Update the last fetched date, so we know where to start next time. 
update_affiliations_date = s.sql.text("""UPDATE settings SET value=:affiliations WHERE setting = 'affiliations_processed'""").bindparams(affiliations=affiliations_fetched) - session.execute_sql(update_affiliations_date) + execute_sql(update_affiliations_date) # On to the aliases, now @@ -220,61 +221,61 @@ def discover_alias(email): get_time = s.sql.text("""SELECT current_timestamp(6) as fetched""") - aliases_fetched = session.execute_sql(get_time).fetchone()[0]#['fetched'] + aliases_fetched = execute_sql(get_time).fetchone()[0]#['fetched'] # Now find the last time we worked on aliases, to figure out what's new - aliases_processed = session.get_setting('aliases_processed') + aliases_processed = facade_helper.get_setting('aliases_processed') get_changed_aliases = s.sql.text("""SELECT alias_email FROM contributors_aliases WHERE cntrb_last_modified >= :aliases""").bindparams(aliases=aliases_processed) - changed_aliases = session.fetchall_data_from_sql_text(get_changed_aliases)#list(cfg.cursor) + changed_aliases = fetchall_data_from_sql_text(get_changed_aliases)#list(cfg.cursor) # Process any aliases which changed since we last checked for changed_alias in changed_aliases: - session.log_activity('Debug',f"Resetting affiliation for {changed_alias['alias_email']}") + facade_helper.log_activity('Debug',f"Resetting affiliation for {changed_alias['alias_email']}") set_author_to_null = s.sql.text("""UPDATE commits SET cmt_author_affiliation = NULL WHERE cmt_author_raw_email LIKE CONCAT('%%',:alias)""").bindparams(alias=changed_alias['alias_email']) - session.insert_or_update_data(set_author_to_null) + facade_helper.insert_or_update_data(set_author_to_null) set_committer_to_null = s.sql.text("""UPDATE commits SET cmt_committer_affiliation = NULL WHERE cmt_committer_raw_email LIKE CONCAT('%%',:alias_email)""").bindparams(alias_email=changed_alias['alias_email']) - session.insert_or_update_data(set_committer_to_null) + facade_helper.insert_or_update_data(set_committer_to_null) reset_author = s.sql.text("""UPDATE commits SET cmt_author_email = :author_email WHERE cmt_author_raw_email = :raw_author_email """).bindparams(author_email=discover_alias(changed_alias['alias_email']),raw_author_email=changed_alias['alias_email']) - session.insert_or_update_data(reset_author) + facade_helper.insert_or_update_data(reset_author) reset_committer = s.sql.text("""UPDATE commits SET cmt_committer_email = :author_email WHERE cmt_committer_raw_email = :raw_author_email """).bindparams(author_email=discover_alias(changed_alias['alias_email']), raw_author_email=changed_alias['alias_email']) - session.insert_or_update_data(reset_committer) + facade_helper.insert_or_update_data(reset_committer) # Update the last fetched date, so we know where to start next time. 
update_aliases_date = s.sql.text("""UPDATE settings SET value=:aliases WHERE setting = 'aliases_processed'""").bindparams(aliases=aliases_fetched) - session.execute_sql(update_aliases_date) + execute_sql(update_aliases_date) # Now rebuild the affiliation data - working_author = session.get_setting('working_author') + working_author = facade_helper.get_setting('working_author') if working_author != 'done': - session.log_activity('Error',f"Trimming author data in affiliations: {working_author}") - trim_author(session, working_author) + facade_helper.log_activity('Error',f"Trimming author data in affiliations: {working_author}") + trim_author(facade_helper, working_author) # Figure out which projects have NULL affiliations so they can be recached @@ -294,7 +295,7 @@ def discover_alias(email): # "SET rg_recache=TRUE WHERE " # "author_affiliation IS NULL OR " # "committer_affiliation IS NULL") - session.execute_sql(set_recache) + execute_sql(set_recache) # Find any authors with NULL affiliations and fill them @@ -304,19 +305,19 @@ def discover_alias(email): WHERE cmt_author_affiliation IS NULL GROUP BY cmt_author_email""") - null_authors = session.fetchall_data_from_sql_text(find_null_authors) + null_authors = fetchall_data_from_sql_text(find_null_authors) - session.log_activity('Debug',f"Found {len(null_authors)} authors with NULL affiliation") + facade_helper.log_activity('Debug',f"Found {len(null_authors)} authors with NULL affiliation") for null_author in null_authors: email = null_author['email'] - store_working_author(session, email) + store_working_author(facade_helper, email) discover_null_affiliations('author',email) - store_working_author(session, 'done') + store_working_author(facade_helper, 'done') # Find any committers with NULL affiliations and fill them @@ -326,15 +327,15 @@ def discover_alias(email): WHERE cmt_committer_affiliation IS NULL GROUP BY cmt_committer_email""") - null_committers = session.fetchall_data_from_sql_text(find_null_committers) + null_committers = fetchall_data_from_sql_text(find_null_committers) - session.log_activity('Debug',f"Found {len(null_committers)} committers with NULL affiliation") + facade_helper.log_activity('Debug',f"Found {len(null_committers)} committers with NULL affiliation") for null_committer in null_committers: email = null_committer['email'] - store_working_author(session, email) + store_working_author(facade_helper, email) discover_null_affiliations('committer',email) @@ -344,43 +345,43 @@ def discover_alias(email): SET cmt_author_affiliation = '(Unknown)' WHERE cmt_author_affiliation IS NULL""") - session.execute_sql(fill_unknown_author) + execute_sql(fill_unknown_author) fill_unknown_committer = s.sql.text("""UPDATE commits SET cmt_committer_affiliation = '(Unknown)' WHERE cmt_committer_affiliation IS NULL""") - session.execute_sql(fill_unknown_committer) + execute_sql(fill_unknown_committer) - store_working_author(session, 'done') + store_working_author(facade_helper, 'done') - session.log_activity('Info','Filling empty affiliations (complete)') + facade_helper.log_activity('Info','Filling empty affiliations (complete)') -def invalidate_caches(session): +def invalidate_caches(facade_helper): # Invalidate all caches - session.update_status('Invalidating caches') - session.log_activity('Info','Invalidating caches') + facade_helper.update_status('Invalidating caches') + facade_helper.log_activity('Info','Invalidating caches') invalidate_cache = s.sql.text("""UPDATE repo_groups SET rg_recache = 1""") - 
session.execute_sql(invalidate_cache) + execute_sql(invalidate_cache) - session.log_activity('Info','Invalidating caches (complete)') + facade_helper.log_activity('Info','Invalidating caches (complete)') -def rebuild_unknown_affiliation_and_web_caches(session): +def rebuild_unknown_affiliation_and_web_caches(facade_helper): # When there's a lot of analysis data, calculating display data on the fly gets # pretty expensive. Instead, we crunch the data based upon the user's preferred # statistics (author or committer) and store them. We also store all records # with an (Unknown) affiliation for display to the user. - session.update_status('Caching data for display') - session.log_activity('Info','Caching unknown affiliations and web data for display') + facade_helper.update_status('Caching data for display') + facade_helper.log_activity('Info','Caching unknown affiliations and web data for display') - report_date = session.get_setting('report_date') - report_attribution = session.get_setting('report_attribution') + report_date = facade_helper.get_setting('report_date') + report_attribution = facade_helper.get_setting('report_attribution') # Clear stale caches @@ -396,7 +397,8 @@ def rebuild_unknown_affiliation_and_web_caches(session): # ("DELETE c.* FROM dm_repo_group_weekly c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_group_weekly) + +# session.execute_sql(clear_dm_repo_group_weekly) clear_dm_repo_group_monthly = s.sql.text(""" DELETE @@ -410,7 +412,8 @@ def rebuild_unknown_affiliation_and_web_caches(session): # ("DELETE c.* FROM dm_repo_group_monthly c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_group_monthly) + +# session.execute_sql(clear_dm_repo_group_monthly) clear_dm_repo_group_annual = s.sql.text(""" DELETE @@ -424,7 +427,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # ("DELETE c.* FROM dm_repo_group_annual c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_group_annual) +# session.execute_sql(clear_dm_repo_group_annual) clear_dm_repo_weekly = s.sql.text(""" DELETE @@ -441,7 +444,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # "JOIN repo r ON c.repo_id = r.repo_id " # "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_weekly) +# session.execute_sql(clear_dm_repo_weekly) clear_dm_repo_monthly = s.sql.text(""" DELETE @@ -458,7 +461,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # "JOIN repo r ON c.repo_id = r.repo_id " # "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_monthly) +# session.execute_sql(clear_dm_repo_monthly) clear_dm_repo_annual = s.sql.text(""" DELETE @@ -475,7 +478,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # "JOIN repo r ON c.repo_id = r.repo_id " # "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_annual) +# session.execute_sql(clear_dm_repo_annual) clear_unknown_cache = s.sql.text(""" DELETE @@ -489,9 +492,9 @@ def rebuild_unknown_affiliation_and_web_caches(session): # ("DELETE c.* FROM unknown_cache c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_unknown_cache) + 
execute_sql(clear_unknown_cache) - session.log_activity('Verbose','Caching unknown authors and committers') + facade_helper.log_activity('Verbose','Caching unknown authors and committers') # Cache the unknown authors @@ -511,9 +514,9 @@ def rebuild_unknown_affiliation_and_web_caches(session): AND p.rg_recache = 1 GROUP BY r.repo_group_id,a.cmt_author_email, info.a, info.b, info.c - """).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + """).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) - session.execute_sql(unknown_authors) + execute_sql(unknown_authors) # Cache the unknown committers @@ -531,13 +534,13 @@ def rebuild_unknown_affiliation_and_web_caches(session): WHERE a.cmt_committer_affiliation = '(Unknown)' AND p.rg_recache = 1 GROUP BY r.repo_group_id,a.cmt_committer_email, info.a, info.b, info.c - """).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + """).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) - session.execute_sql(unknown_committers) + execute_sql(unknown_committers) # Start caching by project - session.log_activity('Verbose','Caching projects') + facade_helper.log_activity('Verbose','Caching projects') cache_projects_by_week = s.sql.text(( "INSERT INTO dm_repo_group_weekly (repo_group_id, email, affiliation, week, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source)" @@ -571,9 +574,9 @@ def rebuild_unknown_affiliation_and_web_caches(session): "affiliation, " f"a.cmt_{report_attribution}_email, " "r.repo_group_id, info.a, info.b, info.c") - ).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + ).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) - session.execute_sql(cache_projects_by_week) +# session.execute_sql(cache_projects_by_week) cache_projects_by_month = s.sql.text( ("INSERT INTO dm_repo_group_monthly (repo_group_id, email, affiliation, month, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source) " @@ -607,9 +610,9 @@ def rebuild_unknown_affiliation_and_web_caches(session): "affiliation, " f"a.cmt_{report_attribution}_email," "r.repo_group_id, info.a, info.b, info.c" - )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + )).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) - session.execute_sql(cache_projects_by_month) +# session.execute_sql(cache_projects_by_month) cache_projects_by_year = s.sql.text(( "INSERT INTO dm_repo_group_annual (repo_group_id, email, affiliation, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source) " @@ -644,15 +647,15 @@ def rebuild_unknown_affiliation_and_web_caches(session): - )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + )).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) - session.execute_sql(cache_projects_by_year) + # session.execute_sql(cache_projects_by_year) # Start caching by repo - session.log_activity('Verbose','Caching 
repos') + facade_helper.log_activity('Verbose','Caching repos') cache_repos_by_week = s.sql.text( ( @@ -687,9 +690,9 @@ def rebuild_unknown_affiliation_and_web_caches(session): "affiliation, " f"a.cmt_{report_attribution}_email," "a.repo_id, info.a, info.b, info.c" - )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + )).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) - session.execute_sql(cache_repos_by_week) +# session.execute_sql(cache_repos_by_week) cache_repos_by_month = s.sql.text(( "INSERT INTO dm_repo_monthly (repo_id, email, affiliation, month, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source)" @@ -723,9 +726,9 @@ def rebuild_unknown_affiliation_and_web_caches(session): "affiliation, " f"a.cmt_{report_attribution}_email," "a.repo_id, info.a, info.b, info.c" - )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + )).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) - session.execute_sql(cache_repos_by_month) +# session.execute_sql(cache_repos_by_month) cache_repos_by_year = s.sql.text(( "INSERT INTO dm_repo_annual (repo_id, email, affiliation, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source)" @@ -757,14 +760,14 @@ def rebuild_unknown_affiliation_and_web_caches(session): "affiliation, " f"a.cmt_{report_attribution}_email," "a.repo_id, info.a, info.b, info.c" - )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + )).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) - session.execute_sql(cache_repos_by_year) +# session.execute_sql(cache_repos_by_year) # Reset cache flags reset_recache = s.sql.text("UPDATE repo_groups SET rg_recache = 0") - session.execute_sql(reset_recache) + execute_sql(reset_recache) - session.log_activity('Info','Caching unknown affiliations and web data for display (complete)') + facade_helper.log_activity('Info','Caching unknown affiliations and web data for display (complete)') diff --git a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py index 64571bdd9b..874f338902 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py @@ -31,30 +31,32 @@ import pathlib import sqlalchemy as s from .utilitymethods import update_repo_log, get_absolute_repo_path +from sqlalchemy.orm.exc import NoResultFound from augur.application.db.models.augur_data import * from augur.application.db.models.augur_operations import CollectionStatus from augur.application.db.util import execute_session_query, convert_orm_list_to_dict_list +from augur.application.db.lib import execute_sql, get_repo_by_repo_git class GitCloneError(Exception): pass -def git_repo_initialize(session, repo_git): +def git_repo_initialize(facade_helper, session, repo_git): # Select any new git repos so we can set up their locations and git clone - session.update_status('Fetching non-cloned repos') - session.log_activity('Info', 'Fetching non-cloned repos') + facade_helper.update_status('Fetching non-cloned repos') + facade_helper.log_activity('Info', 'Fetching 
non-cloned repos') # Get data as a list of dicts - # new_repos = session.fetchall_data_from_sql_text(query)#list(cfg.cursor) + # new_repos = fetchall_data_from_sql_text(query)#list(cfg.cursor) row = Repo.get_by_repo_git(session, repo_git) if row: - session.log_activity( + facade_helper.log_activity( 'Info', f"Fetching repo with repo id: {row.repo_id}") - update_repo_log(session, row.repo_id, 'Cloning') + update_repo_log(logger, facade_helper, row.repo_id, 'Cloning') git = html.unescape(row.repo_git) @@ -62,28 +64,28 @@ def git_repo_initialize(session, repo_git): if git.find('://', 0) > 0: platform_org_git_url_section = git[git.find( '://', 0)+3:][:git[git.find('://', 0)+3:].rfind('/', 0)+1] - session.log_activity( + facade_helper.log_activity( 'Info', f"Repo Relative Path from facade05, from for row in new_repos, line 79: {platform_org_git_url_section}") - session.log_activity('Info', f"The git path used : {git}") + facade_helper.log_activity('Info', f"The git path used : {git}") else: platform_org_git_url_section = git[:git.rfind('/', 0)+1] - session.log_activity( + facade_helper.log_activity( 'Info', f"Repo Relative Path from facade05, line 80, reset at 86: {platform_org_git_url_section}") # Get the name of repo repo_name = git[git.rfind('/', 0)+1:] if repo_name.endswith('.git'): repo_name = repo_name[:repo_name.find('.git', 0)] - session.log_activity( + facade_helper.log_activity( 'Info', f"Repo Name from facade05, line 93: {repo_name}") path_identifier = f"{platform_org_git_url_section}{repo_name}".replace('/','-') # Get the full path to the directory where we'll clone the repo repo_path = ( - f"{session.repo_base_directory}{row.repo_id}-{path_identifier}") - session.log_activity( + f"{facade_helper.repo_base_directory}{row.repo_id}-{path_identifier}") + facade_helper.log_activity( 'Info', f"Repo Path from facade05, line 86: {repo_path}") @@ -91,21 +93,21 @@ def git_repo_initialize(session, repo_git): # query = s.sql.text("""SELECT NULL FROM repo WHERE CONCAT(repo_group_id,'/',repo_path,repo_name) = :repo_group_id # """).bindparams(repo_group_id=f"{row.repo_group_id}/{platform_org_git_url_section}{repo_name}") # - # result = session.fetchall_data_from_sql_text(query) + # result = fetchall_data_from_sql_text(query) query = s.sql.text("""UPDATE repo SET repo_path=:pathParam, repo_name=:nameParam WHERE repo_id=:idParam """).bindparams(pathParam=path_identifier, nameParam=repo_name, idParam=row.repo_id) - session.execute_sql(query) + execute_sql(query) # Check if there will be a storage path collision # If there is a collision, throw an error so that it updates the existing repo instead of trying # to reclone. if os.path.isdir(repo_path): # len(result): - session.log_activity( + facade_helper.log_activity( 'Verbose', f"Identical repo detected, storing {git} in {repo_name}") - session.logger.warning( + logger.warning( f"Identical repo found in facade directory! 
Repo git: {git}") statusQuery = session.query(CollectionStatus).filter( CollectionStatus.repo_id == row.repo_id) @@ -119,7 +121,7 @@ def git_repo_initialize(session, repo_git): repo_name=:nameParam WHERE repo_id=:idParam """).bindparams(pathParam=path_identifier, nameParam=repo_name, idParam=row.repo_id) - session.execute_sql(query) + execute_sql(query) return # Create the prerequisite directories @@ -128,23 +130,23 @@ def git_repo_initialize(session, repo_git): except Exception as e: print("COULD NOT CREATE REPO DIRECTORY") - update_repo_log(session, row.repo_id, 'Failed (mkdir)') - session.update_status(f"Failed (mkdir {repo_path})") - session.log_activity( + update_repo_log(logger, facade_helper, row.repo_id, 'Failed (mkdir)') + facade_helper.update_status(f"Failed (mkdir {repo_path})") + facade_helper.log_activity( 'Error', f"Could not create repo directory: {repo_path}") raise e - update_repo_log(session, row.repo_id, 'New (cloning)') + update_repo_log(logger, facade_helper, row.repo_id, 'New (cloning)') #Make sure newly cloned repo path is recorded in repo table query = s.sql.text("""UPDATE repo SET repo_path=:pathParam, repo_name=:nameParam WHERE repo_id=:idParam """).bindparams(pathParam=path_identifier, nameParam=repo_name, idParam=row.repo_id) - session.execute_sql(query) + execute_sql(query) - session.log_activity('Verbose', f"Cloning: {git}") + facade_helper.log_activity('Verbose', f"Cloning: {git}") cmd = f"git -C {repo_path} clone '{git}' {repo_name}" return_code = subprocess.Popen([cmd], shell=True).wait() @@ -153,18 +155,18 @@ def git_repo_initialize(session, repo_git): # If cloning succeeded, repo is ready for analysis # Mark the entire project for an update, so that under normal # circumstances caches are rebuilt only once per waiting period. - update_repo_log(session, row.repo_id, 'Up-to-date') - session.log_activity('Info', f"Cloned {git}") + update_repo_log(logger, facade_helper, row.repo_id, 'Up-to-date') + facade_helper.log_activity('Info', f"Cloned {git}") else: # If cloning failed, log it and set the status back to new - update_repo_log(session, row.repo_id, f"Failed ({return_code})") + update_repo_log(logger, facade_helper, row.repo_id, f"Failed ({return_code})") - session.log_activity('Error', f"Could not clone {git}") + facade_helper.log_activity('Error', f"Could not clone {git}") raise GitCloneError(f"Could not clone {git}") - session.log_activity('Info', f"Fetching new repos (complete)") + facade_helper.log_activity('Info', f"Fetching new repos (complete)") # Deprecated functionality. No longer used @@ -185,8 +187,8 @@ def check_for_repo_updates(session, repo_git): AND repo_status != 'Analyze' AND repo_status != 'Empty' AND repo_git = :value""").bindparams(value=repo_git) - # repos = session.fetchall_data_from_sql_text(get_initialized_repos)#list(cfg.cursor) - repo = session.execute_sql(get_initialized_repos).fetchone() + # repos = fetchall_data_from_sql_text(get_initialized_repos)#list(cfg.cursor) + repo = execute_sql(get_initialized_repos).fetchone() if repo: @@ -196,7 +198,7 @@ def check_for_repo_updates(session, repo_git): repos_id=:repo_id AND status='Up-to-date' AND date >= CURRENT_TIMESTAMP(6) - INTERVAL :update_freq HOUR """).bindparams(repo_id=repo['repo_id'], update_freq=update_frequency[0]) - result = session.fetchall_data_from_sql_text(get_last_update) + result = fetchall_data_from_sql_text(get_last_update) # If the repo has not been updated within the waiting period, mark it. 
# Also mark any other repos in the project, so we only recache the # project once per waiting period. @@ -213,7 +215,7 @@ def check_for_repo_updates(session, repo_git): # "SET status='Update' WHERE " # "r.id=%s and r.status != 'Empty'") - session.execute_sql(mark_repo) + execute_sql(mark_repo) # Mark the entire project for an update, so that under normal # circumstances caches are rebuilt only once per waiting period. @@ -250,7 +252,7 @@ def force_repo_updates(session, repo_git): get_repo_ids = s.sql.text("""UPDATE repo SET repo_status='Update' WHERE repo_status NOT LIKE 'New%' AND repo_status!='Delete' AND repo_status !='Empty' AND repo_git=:value""").bindparams(value=repo_git) - session.execute_sql(get_repo_ids) + execute_sql(get_repo_ids) session.log_activity('Info', 'Forcing repos to update (complete)') @@ -268,38 +270,35 @@ def force_repo_analysis(session, repo_git): NOT LIKE 'New%' AND repo_status!='Delete' AND repo_status != 'Empty' AND repo_git=:repo_git_ident""").bindparams(repo_git_ident=repo_git) - session.execute_sql(set_to_analyze) + execute_sql(set_to_analyze) session.log_activity('Info', 'Forcing repos to be analyzed (complete)') -def git_repo_updates(session, repo_git): +def git_repo_updates(facade_helper, repo_git): # Update existing repos - session.update_status('Updating repos') - session.log_activity('Info', 'Updating existing repos') + facade_helper.update_status('Updating repos') + facade_helper.log_activity('Info', 'Updating existing repos') # query = s.sql.text("""SELECT repo_id,repo_group_id,repo_git,repo_name,repo_path FROM repo WHERE # repo_status='Update'""") - query = session.query(Repo).filter( - Repo.repo_git == repo_git) - result = execute_session_query(query, 'all') try: - # session.fetchall_data_from_sql_text(query)#list(cfg.cursor) - row = convert_orm_list_to_dict_list(result)[0] - except IndexError: + repo = get_repo_by_repo_git(repo_git) + except NoResultFound: raise Exception( f"Repo git: {repo_git} does not exist or the status is not 'Update'") - if row["repo_path"] is None or row["repo_name"] is None: + + if repo.repo_path is None or repo.repo_name is None: raise Exception( - f"The repo path or repo name is NULL for repo_id: {row['repo_id']}") + f"The repo path or repo name is NULL for repo_id: {repo.repo_id}") - session.log_activity( - 'Verbose', f"Attempting to update {row['repo_git']}") # ['git']) - update_repo_log(session, row['repo_id'], 'Updating') # ['id'],'Updating') + facade_helper.log_activity( + 'Verbose', f"Attempting to update {repo.repo_git}") # ['git']) + update_repo_log(logger, facade_helper, repo.repo_id, 'Updating') # ['id'],'Updating') attempt = 0 @@ -310,7 +309,7 @@ def git_repo_updates(session, repo_git): # default_branch = '' absolute_path = get_absolute_repo_path( - session.repo_base_directory, row["repo_id"], row['repo_path'],row['repo_name']) + facade_helper.repo_base_directory, repo.repo_id, repo.repo_path, repo.repo_name) while attempt < 2: @@ -321,7 +320,7 @@ def git_repo_updates(session, repo_git): return_code_remote = subprocess.Popen( [firstpull], shell=True).wait() - session.log_activity('Verbose', 'Got to here. 1.') + facade_helper.log_activity('Verbose', 'Got to here. 
1.') if return_code_remote == 0: @@ -343,26 +342,26 @@ def git_repo_updates(session, repo_git): remotedefault = remotedefault.decode() - session.log_activity( + facade_helper.log_activity( 'Verbose', f'remote default getting checked out is: {remotedefault}.') getremotedefault = ( f"git -C {absolute_path} checkout {remotedefault}") - session.log_activity( + facade_helper.log_activity( 'Verbose', f"get remote default command is: \n \n {getremotedefault} \n \n ") return_code_remote_default_again = subprocess.Popen( [getremotedefault], shell=True).wait() if return_code_remote_default_again == 0: - session.log_activity('Verbose', "local checkout worked.") + facade_helper.log_activity('Verbose', "local checkout worked.") cmd = (f"git -C {absolute_path} pull") return_code = subprocess.Popen([cmd], shell=True).wait() except Exception as e: - session.log_activity( + facade_helper.log_activity( 'Verbose', f'Error code on branch change is {e}.') pass @@ -378,8 +377,8 @@ def git_repo_updates(session, repo_git): break elif attempt == 0: - session.log_activity( - 'Verbose', f"git pull failed, attempting reset and clean for {row['repo_git']}") + facade_helper.log_activity( + 'Verbose', f"git pull failed, attempting reset and clean for {repo.repo_git}") # remotedefault = 'main' @@ -412,7 +411,7 @@ def git_repo_updates(session, repo_git): return_message_getremotedefault = subprocess.Popen( [getremotedefault], stdout=subprocess.PIPE, shell=True).communicate()[0] - session.log_activity( + facade_helper.log_activity( 'Verbose', f'get remote default result: {return_message_getremotedefault}') getcurrentbranch = (f"git -C {absolute_path} branch") @@ -425,7 +424,7 @@ def git_repo_updates(session, repo_git): localdefault = localdefault.decode() - session.log_activity( + facade_helper.log_activity( 'Verbose', f'remote default is: {remotedefault}, and localdefault is {localdefault}.') cmd_checkout_default = ( @@ -448,7 +447,7 @@ def git_repo_updates(session, repo_git): except Exception as e: - session.log_activity('Verbose', f'Second pass failed: {e}.') + facade_helper.log_activity('Verbose', f'Second pass failed: {e}.') pass cmdpull2 = (f"git -C {absolute_path} pull") @@ -462,12 +461,12 @@ def git_repo_updates(session, repo_git): if return_code == 0: - update_repo_log(session, row['repo_id'], 'Up-to-date') - session.log_activity('Verbose', f"Updated {row['repo_git']}") + update_repo_log(logger, facade_helper, repo.repo_id, 'Up-to-date') + facade_helper.log_activity('Verbose', f"Updated {repo.repo_git}") else: - update_repo_log(session, row['repo_id'], f"Failed ({return_code})") - session.log_activity('Error', f"Could not update {row['repo_git']}") + update_repo_log(logger, facade_helper, repo.repo_id, f"Failed ({return_code})") + facade_helper.log_activity('Error', f"Could not update {repo.repo_git}") - session.log_activity('Info', 'Updating existing repos (complete)') + facade_helper.log_activity('Info', 'Updating existing repos (complete)') diff --git a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py index 848cb38917..caae6c02ba 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py @@ -26,55 +26,47 @@ # repos. It also rebuilds analysis data, checks any changed affiliations and # aliases, and caches data for display. 
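The retry path in git_repo_updates above boils down to: attempt a plain pull, and if it fails, resolve the remote default branch, check it out, and pull again. A minimal standalone sketch of that flow, assuming `git remote show origin` is an acceptable way to discover the default branch (the patch derives it with its own commands); the helper name and logger parameter are illustrative only:

import subprocess

def pull_with_fallback(absolute_path, logger):
    """Try a plain `git pull`; if it fails, check out the remote HEAD branch and retry once."""
    pull_cmd = f"git -C {absolute_path} pull"
    return_code = subprocess.Popen([pull_cmd], shell=True).wait()
    if return_code == 0:
        return return_code

    # Ask the remote which branch HEAD points at (e.g. "main" or "master").
    output = subprocess.check_output(f"git -C {absolute_path} remote show origin", shell=True).decode()
    remote_default = next(
        (line.split(":", 1)[1].strip() for line in output.splitlines() if "HEAD branch" in line),
        "main",  # assumption: fall back to "main" when the remote does not report a HEAD branch
    )
    logger.debug(f"Retrying pull on remote default branch: {remote_default}")

    subprocess.Popen([f"git -C {absolute_path} checkout {remote_default}"], shell=True).wait()
    return subprocess.Popen([pull_cmd], shell=True).wait()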
import subprocess -from subprocess import check_output +from subprocess import check_output, CalledProcessError import os import sqlalchemy as s -from sqlalchemy.exc import DataError from augur.application.db.models import * -from .config import FacadeSession as FacadeSession +from .config import FacadeHelper as FacadeHelper from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps +from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text, remove_working_commits_by_repo_id_and_hashes, remove_commits_by_repo_id_and_hashes, get_repo_by_repo_git, get_session +from augur.application.db.util import execute_session_query #from augur.tasks.git.util.facade_worker.facade -def update_repo_log(session, repos_id,status): +def update_repo_log(logger, facade_helper, repos_id,status): # Log a repo's fetch status - session.log_activity("Info",f"{status} {repos_id}") + facade_helper.log_activity("Info",f"{status} {repos_id}") #log_message = ("INSERT INTO repos_fetch_log (repos_id,status) " # "VALUES (%s,%s)") try: log_message = s.sql.text("""INSERT INTO repos_fetch_log (repos_id,status) VALUES (:repo_id,:repo_status)""").bindparams(repo_id=repos_id,repo_status=status) - #session.insert_data(data,t_repos_fetch_log,['repos_id','status']) - session.execute_sql(log_message) + #bulk_insert_dicts(data,t_repos_fetch_log,['repos_id','status']) + execute_sql(log_message) except Exception as e: - session.logger.error(f"Ran into error in update_repo_log: {e}") + logger.error(f"Ran into error in update_repo_log: {e}") pass -def trim_commits(session, repo_id,commits): +def trim_commits(facade_helper, repo_id,commits): # Quickly remove a given commit if len(commits): - remove_commit = s.sql.text("""DELETE FROM commits - WHERE repo_id=:repo_id - AND cmt_commit_hash IN :hashes""").bindparams(repo_id=repo_id,hashes=tuple(commits)) - - - session.execute_sql(remove_commit) + remove_commits_by_repo_id_and_hashes(repo_id, commits) # Remove the working commit. - remove_commit = s.sql.text("""DELETE FROM working_commits - WHERE repos_id = :repo_id AND - working_commit IN :hashes""").bindparams(repo_id=repo_id,hashes=tuple(commits)) - - session.execute_sql(remove_commit) + remove_working_commits_by_repo_id_and_hashes(repo_id, commits) for commit in commits: - session.log_activity('Debug',f"Trimmed commit: {commit}") - session.log_activity('Debug',f"Removed working commit: {commit}") + facade_helper.log_activity('Debug',f"Trimmed commit: {commit}") + facade_helper.log_activity('Debug',f"Removed working commit: {commit}") -def store_working_author(session, email): +def store_working_author(facade_helper, email): # Store the working author during affiliation discovery, in case it is # interrupted and needs to be trimmed. @@ -84,11 +76,11 @@ def store_working_author(session, email): WHERE setting = 'working_author' """).bindparams(email=email) - session.execute_sql(store) + execute_sql(store) - session.log_activity('Debug',f"Stored working author: {email}") + facade_helper.log_activity('Debug',f"Stored working author: {email}") -def trim_author(session, email): +def trim_author(facade_helper, email): # Remove the affiliations associated with an email. Used when an analysis is # interrupted during affiliation layering, and the data will be corrupt. 
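Throughout these hunks, methods that used to hang off the facade session (session.execute_sql, session.fetchall_data_from_sql_text) are replaced by module-level helpers imported from augur.application.db.lib. A rough sketch of the calling convention those helpers imply; the connection string, sessionmaker, and helper bodies here are illustrative stand-ins, not the library's actual implementation:

from contextlib import contextmanager

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# assumption: in Augur the engine and sessions come from application config, not a literal URL
engine = create_engine("postgresql+psycopg2://augur:augur@localhost:5432/augur")
Session = sessionmaker(bind=engine)

@contextmanager
def get_session():
    """Short-lived session with commit-on-success, rollback-on-error semantics."""
    session = Session()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()

def execute_sql(query):
    """Run an already-bound statement (used mostly for INSERT/UPDATE/DELETE in these hunks)."""
    with get_session() as session:
        return session.execute(query)

def fetchall_data_from_sql_text(query):
    """Return rows as dictionaries, matching how callers index results by column name."""
    with get_session() as session:
        return [dict(row) for row in session.execute(query).mappings()]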
@@ -97,21 +89,17 @@ def trim_author(session, email): SET cmt_author_affiliation = NULL WHERE cmt_author_email = :email """).bindparams(email=email) - - - - session.execute_sql(trim) + execute_sql(trim) trim = s.sql.text("""UPDATE commits SET cmt_committer_affiliation = NULL WHERE cmt_committer_email = :email """).bindparams(email=email) + execute_sql(trim) - session.execute_sql(trim) - - store_working_author(session, 'done') + store_working_author(facade_helper, 'done') - session.log_activity('Debug',f"Trimmed working author: {email}") + facade_helper.log_activity('Debug',f"Trimmed working author: {email}") def get_absolute_repo_path(repo_base_dir, repo_id, repo_path,repo_name): @@ -134,12 +122,12 @@ def get_parent_commits_set(absolute_repo_path, start_date): return parent_commits -def get_existing_commits_set(session, repo_id): +def get_existing_commits_set(repo_id): find_existing = s.sql.text("""SELECT DISTINCT cmt_commit_hash FROM commits WHERE repo_id=:repo_id """).bindparams(repo_id=repo_id) - existing_commits = [commit['cmt_commit_hash'] for commit in session.fetchall_data_from_sql_text(find_existing)] + existing_commits = [commit['cmt_commit_hash'] for commit in fetchall_data_from_sql_text(find_existing)] return set(existing_commits) @@ -148,15 +136,15 @@ def count_branches(git_dir): branches_dir = os.path.join(git_dir, 'refs', 'heads') return sum(1 for _ in os.scandir(branches_dir)) -def get_repo_commit_count(session, repo_git): - - repo = Repo.get_by_repo_git(session, repo_git) +def get_repo_commit_count(logger, facade_helper, repo_git): - absolute_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) + repo = get_repo_by_repo_git(repo_git) + + absolute_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) repo_loc = (f"{absolute_path}/.git") - session.logger.debug(f"loc: {repo_loc}") - session.logger.debug(f"path: {repo.repo_path}") + logger.debug(f"loc: {repo_loc}") + logger.debug(f"path: {repo.repo_path}") # Check if the .git directory exists if not os.path.exists(repo_loc): @@ -166,82 +154,60 @@ def get_repo_commit_count(session, repo_git): if count_branches(repo_loc) == 0: return 0 - check_commit_count_cmd = check_output(["git", "--git-dir", repo_loc, "rev-list", "--count", "HEAD"]) + try: + check_commit_count_cmd = check_output( + ["git", "--git-dir", repo_loc, "rev-list", "--count", "HEAD"], + stderr=subprocess.PIPE) + except CalledProcessError as e: + logger.error(f"Ran into {e}: {e.output} {e.stderr} \n With return code {e.returncode}") + raise e + + commit_count = int(check_commit_count_cmd) return commit_count -def get_facade_weight_time_factor(session,repo_git): - repo = Repo.get_by_repo_git(session, repo_git) - - try: - status = repo.collection_status[0] - time_factor = calculate_date_weight_from_timestamps(repo.repo_added, status.facade_data_last_collected) - except IndexError: - time_factor = calculate_date_weight_from_timestamps(repo.repo_added, None) - - #Adjust for commits. - time_factor *= 1.2 +def get_facade_weight_time_factor(repo_git): + + with get_session() as session: + + query = session.query(Repo).filter(Repo.repo_git == repo_git) + repo = execute_session_query(query, 'one') + + try: + status = repo.collection_status[0] + time_factor = calculate_date_weight_from_timestamps(repo.repo_added, status.facade_data_last_collected) + except IndexError: + time_factor = calculate_date_weight_from_timestamps(repo.repo_added, None) + + #Adjust for commits. 
+ time_factor *= 1.2 - return time_factor + return time_factor -def get_facade_weight_with_commit_count(session, repo_git, commit_count): - return commit_count - get_facade_weight_time_factor(session, repo_git) +def get_facade_weight_with_commit_count(repo_git, commit_count): + return commit_count - get_facade_weight_time_factor(repo_git) -def get_repo_weight_by_commit(logger,repo_git): - with FacadeSession(logger) as session: - return get_repo_commit_count(session, repo_git) - get_facade_weight_time_factor(session, repo_git) +def get_repo_weight_by_commit(logger, repo_git): + facade_helper = FacadeHelper(logger) + return get_repo_commit_count(logger, facade_helper, repo_git) - get_facade_weight_time_factor(repo_git) -def update_facade_scheduling_fields(session, repo_git, weight, commit_count): - repo = Repo.get_by_repo_git(session, repo_git) +def update_facade_scheduling_fields(repo_git, weight, commit_count): - update_query = ( - s.update(CollectionStatus) - .where(CollectionStatus.repo_id == repo.repo_id) - .values(facade_weight=weight,commit_sum=commit_count) - ) + repo = get_repo_by_repo_git(repo_git) - session.execute(update_query) - session.commit() + with get_session() as session: -def facade_bulk_insert_commits(session,records): + update_query = ( + s.update(CollectionStatus) + .where(CollectionStatus.repo_id == repo.repo_id) + .values(facade_weight=weight,commit_sum=commit_count) + ) - try: - session.execute( - s.insert(Commit), - records, - ) + session.execute(update_query) session.commit() - except Exception as e: - - if len(records) > 1: - session.logger.error(f"Ran into issue when trying to insert commits \n Error: {e}") - - #split list into halves and retry insert until we isolate offending record - firsthalfRecords = records[:len(records)//2] - secondhalfRecords = records[len(records)//2:] - - facade_bulk_insert_commits(session,firsthalfRecords) - facade_bulk_insert_commits(session,secondhalfRecords) - elif len(records) == 1 and isinstance(e,DataError) and "time zone displacement" in f"{e}": - commit_record = records[0] - #replace incomprehensible dates with epoch. 
- #2021-10-11 11:57:46 -0500 - placeholder_date = "1970-01-01 00:00:15 -0500" - - #Check for improper utc timezone offset - #UTC timezone offset should be betwen -14:00 and +14:00 - - commit_record['author_timestamp'] = placeholder_date - commit_record['committer_timestamp'] = placeholder_date - - session.execute( - s.insert(Commit), - [commit_record], - ) - session.commit() - else: - raise e + + diff --git a/augur/tasks/git/util/facade_worker/setup.py b/augur/tasks/git/util/facade_worker/setup.py deleted file mode 100644 index 298baff49d..0000000000 --- a/augur/tasks/git/util/facade_worker/setup.py +++ /dev/null @@ -1,47 +0,0 @@ -#SPDX-License-Identifier: MIT -import io -import os -import re - -from setuptools import find_packages -from setuptools import setup - -def read(filename): - filename = os.path.join(os.path.dirname(__file__), filename) - text_type = type(u"") - with io.open(filename, mode="r", encoding='utf-8') as fd: - return re.sub(text_type(r':[a-z]+:`~?(.*?)`'), text_type(r'``\1``'), fd.read()) - -setup( - name="facade_worker", - version="1.3.0", - url="https://github.com/chaoss/augur", - license='MIT', - author="Augurlabs", - author_email="s@goggins.com", - description="Augur Worker that parses and collects git log data", - packages=find_packages(exclude=('tests',)), - install_requires=[ - 'Flask==2.0.2', - 'Flask-Cors==3.0.10', - 'Flask-Login==0.5.0', - 'Flask-WTF==1.0.0', - 'requests==2.28.0', - 'psycopg2-binary==2.9.3', - 'click==8.0.3', - 'XlsxWriter==1.3.7' - ], - entry_points={ - #TODO: change to celery - 'console_scripts': [ - 'facade_worker_start=facade_worker.runtime:main', - ], - }, - classifiers=[ - 'Development Status :: 2 - Pre-Alpha', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.7', - ] -) diff --git a/augur/tasks/github/__init__.py b/augur/tasks/github/__init__.py index 29823eafe5..63d68da41b 100644 --- a/augur/tasks/github/__init__.py +++ b/augur/tasks/github/__init__.py @@ -1,7 +1,7 @@ -from augur.tasks.github.contributors.tasks import * -from augur.tasks.github.events.tasks import * -from augur.tasks.github.issues.tasks import * -from augur.tasks.github.messages.tasks import * +from augur.tasks.github.contributors import * +from augur.tasks.github.events import * +from augur.tasks.github.issues import * +from augur.tasks.github.messages import * from augur.tasks.github.pull_requests.tasks import * from augur.tasks.github.repo_info.tasks import * from augur.tasks.github.releases.tasks import * diff --git a/augur/tasks/github/augur-notes.code-workspace b/augur/tasks/github/augur-notes.code-workspace new file mode 100644 index 0000000000..f99c46d484 --- /dev/null +++ b/augur/tasks/github/augur-notes.code-workspace @@ -0,0 +1,14 @@ +{ + "folders": [ + { + "path": "../../../../../augurlabs/augur-notes" + }, + { + "path": "../../.." 
+ }, + { + "path": "../../../../../sociallycompute/project2025" + } + ], + "settings": {} +} \ No newline at end of file diff --git a/augur/tasks/github/contributors/tasks.py b/augur/tasks/github/contributors.py similarity index 53% rename from augur/tasks/github/contributors/tasks.py rename to augur/tasks/github/contributors.py index 882725d205..20f796647e 100644 --- a/augur/tasks/github/contributors/tasks.py +++ b/augur/tasks/github/contributors.py @@ -1,13 +1,16 @@ import time import logging +import traceback from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.github.util.github_paginator import hit_api -from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.facade_github.tasks import * -from augur.application.db.models import Contributor, Repo +from augur.application.db.models import Contributor from augur.application.db.util import execute_session_query +from augur.application.db.lib import bulk_insert_dicts, get_session, batch_insert_contributors +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth + @celery.task @@ -19,48 +22,48 @@ def process_contributors(): tool_version = "2.0" data_source = "Github API" - with GithubTaskManifest(logger) as manifest: + key_auth = GithubRandomKeyAuth(logger) - augur_db = manifest.augur_db + with get_session() as session: - query = augur_db.session.query(Contributor).filter(Contributor.data_source == data_source, Contributor.cntrb_created_at is None, Contributor.cntrb_last_used is None) + query = session.query(Contributor).filter(Contributor.data_source == data_source, Contributor.cntrb_created_at is None, Contributor.cntrb_last_used is None) contributors = execute_session_query(query, 'all') - contributors_len = len(contributors) + contributors_len = len(contributors) - if contributors_len == 0: - logger.info("No contributors to enrich...returning...") - return + if contributors_len == 0: + logger.info("No contributors to enrich...returning...") + return - print(f"Length of contributors to enrich: {contributors_len}") - enriched_contributors = [] - for index, contributor in enumerate(contributors): + print(f"Length of contributors to enrich: {contributors_len}") + enriched_contributors = [] + for index, contributor in enumerate(contributors): - logger.info(f"Contributor {index + 1} of {contributors_len}") + logger.info(f"Contributor {index + 1} of {contributors_len}") - contributor_dict = contributor.__dict__ + contributor_dict = contributor.__dict__ - del contributor_dict["_sa_instance_state"] + del contributor_dict["_sa_instance_state"] - url = f"https://api.github.com/users/{contributor_dict['cntrb_login']}" + url = f"https://api.github.com/users/{contributor_dict['cntrb_login']}" - data = retrieve_dict_data(url, manifest.key_auth, logger) + data = retrieve_dict_data(url, key_auth, logger) - if data is None: - print(f"Unable to get contributor data for: {contributor_dict['cntrb_login']}") - continue + if data is None: + print(f"Unable to get contributor data for: {contributor_dict['cntrb_login']}") + continue - new_contributor_data = { - "cntrb_created_at": data["created_at"], - "cntrb_last_used": data["updated_at"] - } + new_contributor_data = { + "cntrb_created_at": data["created_at"], + "cntrb_last_used": data["updated_at"] + } - contributor_dict.update(new_contributor_data) + contributor_dict.update(new_contributor_data) - enriched_contributors.append(contributor_dict) + 
enriched_contributors.append(contributor_dict) - logger.info(f"Enriching {len(enriched_contributors)} contributors") - augur_db.insert_data(enriched_contributors, Contributor, ["cntrb_id"]) + logger.info(f"Enriching {len(enriched_contributors)} contributors") + batch_insert_contributors(logger, enriched_contributors) @@ -109,14 +112,10 @@ def grab_comitters(self, repo_git,platform="github"): engine = self.app.engine logger = logging.getLogger(grab_comitters.__name__) - with DatabaseSession(logger,engine) as session: - - repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() - repo_id = repo.repo_id try: - with GithubTaskManifest(logger) as manifest: - grab_committer_list(manifest, repo_id,platform) + key_auth = GithubRandomKeyAuth(logger) + grab_committer_list(logger, key_auth, repo_git, platform) except Exception as e: logger.error(f"Could not grab committers from github endpoint!\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") diff --git a/augur/tasks/github/contributors/core.py b/augur/tasks/github/contributors/core.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/augur/tasks/github/detect_move/core.py b/augur/tasks/github/detect_move/core.py index 2bf96ffa1f..b302a70a06 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/augur/tasks/github/detect_move/core.py @@ -6,10 +6,11 @@ from datetime import datetime from augur.tasks.util.collection_state import CollectionState from augur.application.db.util import execute_session_query +from augur.application.db.lib import bulk_insert_dicts -def update_repo_with_dict(repo,new_dict,logger,db): +def update_repo_with_dict(repo,new_dict,logger): """ Update a repository record in the database using a dictionary tagged with the appropriate table fields @@ -25,7 +26,7 @@ def update_repo_with_dict(repo,new_dict,logger,db): del to_insert['_sa_instance_state'] to_insert.update(new_dict) - result = db.insert_data(to_insert, Repo, ['repo_id']) + result = bulk_insert_dicts(logger, to_insert, Repo, ['repo_id']) url = to_insert['repo_git'] logger.info(f"Updated repo for {url}\n") @@ -43,7 +44,7 @@ def extract_owner_and_repo_from_endpoint(key_auth, url, logger): return splits[0], splits[-1] -def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook='core'): +def ping_github_for_repo_move(session, key_auth, repo, logger,collection_hook='core'): owner, name = get_owner_repo(repo.repo_git) url = f"https://api.github.com/repos/{owner}/{name}" @@ -76,7 +77,7 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' 'description': f"(Originally hosted at {url}) {old_description}" } - update_repo_with_dict(repo, repo_update_dict, logger,augur_db) + update_repo_with_dict(repo, repo_update_dict, logger) raise Exception("ERROR: Repo has moved! 
Resetting Collection!") @@ -90,9 +91,9 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' 'data_collection_date': datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') } - update_repo_with_dict(repo, repo_update_dict, logger, augur_db) + update_repo_with_dict(repo, repo_update_dict, logger) - statusQuery = augur_db.session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) + statusQuery = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) collectionRecord = execute_session_query(statusQuery,'one') @@ -113,8 +114,8 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' collectionRecord.ml_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') - augur_db.session.commit() - raise Exception("ERROR: Repo has moved! Resetting Collection!") + session.commit() + raise Exception("ERROR: Repo has moved, and there is no redirection! 404 returned, not 301. Resetting Collection!") if attempts >= 10: diff --git a/augur/tasks/github/detect_move/tasks.py b/augur/tasks/github/detect_move/tasks.py index c9da0d3ca2..f542d89289 100644 --- a/augur/tasks/github/detect_move/tasks.py +++ b/augur/tasks/github/detect_move/tasks.py @@ -1,11 +1,10 @@ import logging -from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.detect_move.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask, AugurSecondaryRepoCollectionTask -from augur.application.db.util import execute_session_query - +from augur.application.db.lib import get_repo_by_repo_git, get_session +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth @celery.task(base=AugurCoreRepoCollectionTask) @@ -14,14 +13,18 @@ def detect_github_repo_move_core(repo_git : str) -> None: logger = logging.getLogger(detect_github_repo_move_core.__name__) logger.info(f"Starting repo_move operation with {repo_git}") - with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + + repo = get_repo_by_repo_git(repo_git) + + logger.info(f"Pinging repo: {repo_git}") + + key_auth = GithubRandomKeyAuth(logger) + + with get_session() as session: + #Ping each repo with the given repo_git to make sure #that they are still in place. - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query, 'one') - logger.info(f"Pinging repo: {repo_git}") - ping_github_for_repo_move(augur_db, manifest.key_auth, repo, logger) + ping_github_for_repo_move(session, key_auth, repo, logger) @celery.task(base=AugurSecondaryRepoCollectionTask) @@ -30,11 +33,15 @@ def detect_github_repo_move_secondary(repo_git : str) -> None: logger = logging.getLogger(detect_github_repo_move_secondary.__name__) logger.info(f"Starting repo_move operation with {repo_git}") - with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + + repo = get_repo_by_repo_git(repo_git) + + logger.info(f"Pinging repo: {repo_git}") + + key_auth = GithubRandomKeyAuth(logger) + + with get_session() as session: + #Ping each repo with the given repo_git to make sure #that they are still in place. 
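ping_github_for_repo_move now takes a plain session plus a GithubRandomKeyAuth instead of the old manifest object, but the underlying check is unchanged: hit the repos endpoint and see whether GitHub still answers for the stored owner/name. A simplified, self-contained sketch of that check using requests (the task itself goes through hit_api and also rewrites the repo row and collection status), with an illustrative function name:

from typing import Optional

import requests

def repo_has_moved(owner: str, name: str, token: Optional[str] = None) -> bool:
    """Return True when GitHub no longer serves the repo under its stored owner/name."""
    headers = {"Authorization": f"token {token}"} if token else {}
    response = requests.get(f"https://api.github.com/repos/{owner}/{name}",
                            headers=headers, allow_redirects=False)
    if response.status_code == 301:
        return True   # renamed or transferred; the Location header points at the new home
    if response.status_code == 404:
        return True   # gone entirely, with no redirect (the case the updated error message calls out)
    return False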
- query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query, 'one') - logger.info(f"Pinging repo: {repo_git}") - ping_github_for_repo_move(augur_db, manifest.key_auth, repo, logger,collection_hook='secondary') \ No newline at end of file + ping_github_for_repo_move(session, key_auth, repo, logger,collection_hook='secondary') \ No newline at end of file diff --git a/augur/tasks/github/events.py b/augur/tasks/github/events.py new file mode 100644 index 0000000000..08efb35d92 --- /dev/null +++ b/augur/tasks/github/events.py @@ -0,0 +1,400 @@ +import logging +import traceback +import sqlalchemy as s +from sqlalchemy.sql import text +from abc import ABC, abstractmethod +from datetime import datetime, timedelta, timezone + +from augur.tasks.init.celery_app import celery_app as celery +from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask +from augur.application.db.data_parse import * +from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.tasks.github.util.util import get_owner_repo +from augur.tasks.util.worker_util import remove_duplicate_dicts +from augur.application.db.models import PullRequestEvent, IssueEvent, Contributor, Repo +from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_issues_by_repo_id, get_pull_requests_by_repo_id, update_issue_closed_cntrbs_by_repo_id, get_session, get_engine, get_core_data_last_collected, batch_insert_contributors + + +platform_id = 1 + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_events(repo_git: str, full_collection: bool): + + logger = logging.getLogger(collect_events.__name__) + + owner, repo = get_owner_repo(repo_git) + + logger.debug(f"Collecting Github events for {owner}/{repo}") + + if full_collection: + core_data_last_collected = None + else: + repo_id = get_repo_by_repo_git(repo_git).repo_id + + # subtract 2 days to ensure all data is collected + core_data_last_collected = (get_core_data_last_collected(repo_id) - timedelta(days=2)).replace(tzinfo=timezone.utc) + + key_auth = GithubRandomKeyAuth(logger) + + if bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, repo): + collection_strategy = BulkGithubEventCollection(logger) + else: + collection_strategy = ThoroughGithubEventCollection(logger) + + collection_strategy.collect(repo_git, key_auth, core_data_last_collected) + +def bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, repo): + + url = f"https://api.github.com/repos/{owner}/{repo}/issues/events?per_page=100" + + github_data_access = GithubDataAccess(key_auth, logger) + + page_count = github_data_access.get_resource_page_count(url) + + if page_count > 300: + raise Warning(f"Page Count is {page_count}. Either github raised the paginator page limit for things like events and messages, or is_pagination_limited_by_max_github_pages is being used on a resource that does not have a page limit. 
Url: {url}") + + return page_count != 300 + +class NotMappableException(Exception): + pass + +class GithubEventCollection(ABC): + + def __init__(self, logger): + self._logger = logger + self._tool_source = "Github events task" + self._tool_version = "2.0" + self._data_source = "Github API" + + @abstractmethod + def collect(self, repo_git, key_auth, since): + pass + + def _insert_issue_events(self, events): + issue_event_natural_keys = ["issue_id", "issue_event_src_id"] + bulk_insert_dicts(self._logger, events, IssueEvent, issue_event_natural_keys) + + def _insert_pr_events(self, events): + pr_event_natural_keys = ["repo_id", "issue_event_src_id"] + bulk_insert_dicts(self._logger, events, PullRequestEvent, pr_event_natural_keys) + + def _insert_contributors(self, contributors): + batch_insert_contributors(self._logger, contributors) + + def _process_github_event_contributors(self, event): + + if event["actor"]: + + event_cntrb = extract_needed_contributor_data(event["actor"], self._tool_source, self._tool_version, self._data_source) + event["cntrb_id"] = event_cntrb["cntrb_id"] + + else: + event["cntrb_id"] = None + return event, None + + return event, event_cntrb + +class BulkGithubEventCollection(GithubEventCollection): + + def __init__(self, logger): + + self.task_name = f"Bulk Github Event task" + self.repo_identifier = "" + + + super().__init__(logger) + + def collect(self, repo_git, key_auth, since): + + repo_obj = get_repo_by_repo_git(repo_git) + repo_id = repo_obj.repo_id + + owner, repo = get_owner_repo(repo_git) + self.repo_identifier = f"{owner}/{repo}" + + events = [] + for event in self._collect_events(repo_git, key_auth, since): + events.append(event) + + # making this a decent size since process_events retrieves all the issues and prs each time + if len(events) >= 500: + self._process_events(events, repo_id) + events.clear() + + if events: + self._process_events(events, repo_id) + + def _collect_events(self, repo_git: str, key_auth, since): + + owner, repo = get_owner_repo(repo_git) + + url = f"https://api.github.com/repos/{owner}/{repo}/issues/events" + + github_data_access = GithubDataAccess(key_auth, self._logger) + + for event in github_data_access.paginate_resource(url): + + yield event + + # return if last event on the page was updated before the since date + if since and datetime.fromisoformat(event["created_at"].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) < since: + return + + def _process_events(self, events, repo_id): + + issue_events = [] + pr_events = [] + not_mappable_events = [] + for event in events: + + try: + if self._is_pr_event(event): + pr_events.append(event) + else: + issue_events.append(event) + except NotMappableException: + not_mappable_events.append(event) + + if not_mappable_events: + self._logger.warning(f"{self.repo_identifier} - {self.task_name}: Unable to map these github events to an issue or pr: {not_mappable_events}") + + self._process_issue_events(issue_events, repo_id) + self._process_pr_events(pr_events, repo_id) + + update_issue_closed_cntrbs_by_repo_id(repo_id) + + def _process_issue_events(self, issue_events, repo_id): + + issue_event_dicts = [] + contributors = [] + + + issue_url_to_id_map = self._get_map_from_issue_url_to_id(repo_id) + + for event in issue_events: + + event, contributor = self._process_github_event_contributors(event) + + issue_url = event["issue"]["url"] + + try: + issue_id = issue_url_to_id_map[issue_url] + except KeyError: + self._logger.warning(f"{self.repo_identifier} - {self.task_name}: Could not find 
related issue. We were searching for: {issue_url}") + continue + + issue_event_dicts.append( + extract_issue_event_data(event, issue_id, platform_id, repo_id, + self._tool_source, self._tool_version, self._data_source) + ) + + if contributor: + contributors.append(contributor) + + contributors = remove_duplicate_dicts(contributors) + + self._insert_contributors(contributors) + + self._insert_issue_events(issue_event_dicts) + + def _process_pr_events(self, pr_events, repo_id): + + pr_event_dicts = [] + contributors = [] + + pr_url_to_id_map = self._get_map_from_pr_url_to_id(repo_id) + + for event in pr_events: + + event, contributor = self._process_github_event_contributors(event) + + pr_url = event["issue"]["pull_request"]["url"] + + try: + pull_request_id = pr_url_to_id_map[pr_url] + except KeyError: + + self._logger.warning(f"{self.repo_identifier} - {self.task_name}: Could not find related pr. We were searching for: {pr_url}") + continue + + pr_event_dicts.append( + extract_pr_event_data(event, pull_request_id, int(event['issue']["id"]), platform_id, repo_id, + self._tool_source, self._tool_version, self._data_source) + ) + + if contributor: + contributors.append(contributor) + + contributors = remove_duplicate_dicts(contributors) + + self._insert_contributors(contributors) + + self._insert_pr_events(pr_event_dicts) + + def _get_map_from_pr_url_to_id(self, repo_id): + + pr_url_to_id_map = {} + prs = get_pull_requests_by_repo_id(repo_id) + for pr in prs: + pr_url_to_id_map[pr.pr_url] = pr.pull_request_id + + return pr_url_to_id_map + + def _get_map_from_issue_url_to_id(self, repo_id): + + issue_url_to_id_map = {} + issues = get_issues_by_repo_id(repo_id) + for issue in issues: + issue_url_to_id_map[issue.issue_url] = issue.issue_id + + return issue_url_to_id_map + + def _is_pr_event(self, event): + + if event["issue"] is None: + raise NotMappableException("Not mappable to pr or issue") + + return event["issue"].get('pull_request', None) != None + +class ThoroughGithubEventCollection(GithubEventCollection): + + def __init__(self, logger): + super().__init__(logger) + + def collect(self, repo_git, key_auth, since): + + repo_obj = get_repo_by_repo_git(repo_git) + repo_id = repo_obj.repo_id + + owner, repo = get_owner_repo(repo_git) + self.repo_identifier = f"{owner}/{repo}" + + self._collect_and_process_issue_events(owner, repo, repo_id, key_auth, since) + self._collect_and_process_pr_events(owner, repo, repo_id, key_auth, since) + + def _collect_and_process_issue_events(self, owner, repo, repo_id, key_auth, since): + + engine = get_engine() + + with engine.connect() as connection: + + if since: + # TODO: Remove src id if it ends up not being needed + query = text(f""" + select issue_id as issue_id, gh_issue_number as issue_number, gh_issue_id as gh_src_id + from issues + where repo_id={repo_id} + and updated_at > timestamptz(timestamp '{since}') + order by created_at desc; + """) + else: + # TODO: Remove src id if it ends up not being needed + query = text(f""" + select issue_id as issue_id, gh_issue_number as issue_number, gh_issue_id as gh_src_id + from issues + where repo_id={repo_id} + order by created_at desc; + """) + + issue_result = connection.execute(query).fetchall() + + events = [] + contributors = [] + github_data_access = GithubDataAccess(key_auth, self._logger) + for db_issue in issue_result: + issue = db_issue._asdict() + + issue_number = issue["issue_number"] + + event_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}/events" + + try: + + for event 
in github_data_access.paginate_resource(event_url): + + event, contributor = self._process_github_event_contributors(event) + + if contributor: + contributors.append(contributor) + + events.append( + extract_issue_event_data(event, issue["issue_id"], platform_id, repo_id, + self._tool_source, self._tool_version, self._data_source) + ) + except UrlNotFoundException as e: + self._logger.warning(f"{self.repo_identifier}: Url not found for {event_url}") + + if len(events) > 500: + self._insert_contributors(contributors) + self._insert_issue_events(events) + events.clear() + + if events: + self._insert_contributors(contributors) + self._insert_issue_events(events) + events.clear() + + + def _collect_and_process_pr_events(self, owner, repo, repo_id, key_auth, since): + + engine = get_engine() + + with engine.connect() as connection: + + if since: + query = text(f""" + select pull_request_id, pr_src_number as gh_pr_number, pr_src_id + from pull_requests + where repo_id={repo_id} + and pr_updated_at > timestamptz(timestamp '{since}') + order by pr_created_at desc; + """) + else: + query = text(f""" + select pull_request_id, pr_src_number as gh_pr_number, pr_src_id + from pull_requests + where repo_id={repo_id} + order by pr_created_at desc; + """) + + pr_result = connection.execute(query).fetchall() + + events = [] + contributors = [] + github_data_access = GithubDataAccess(key_auth, self._logger) + for db_pr in pr_result: + pr = db_pr._asdict() + + pr_number = pr["gh_pr_number"] + + event_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{pr_number}/events" + + try: + + for event in github_data_access.paginate_resource(event_url): + + event, contributor = self._process_github_event_contributors(event) + + if contributor: + contributors.append(contributor) + + events.append( + extract_pr_event_data(event, pr["pull_request_id"], pr["pr_src_id"] , platform_id, repo_id, + self._tool_source, self._tool_version, self._data_source) + ) + except UrlNotFoundException: + self._logger.warning(f"{self.repo_identifier}: Url not found for {event_url}") + continue + + if len(events) > 500: + self._insert_contributors(contributors) + self._insert_pr_events(events) + events.clear() + + if events: + self._insert_contributors(contributors) + self._insert_pr_events(events) + events.clear() diff --git a/augur/tasks/github/events/__init__.py b/augur/tasks/github/events/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/augur/tasks/github/events/core.py b/augur/tasks/github/events/core.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/augur/tasks/github/events/tasks.py b/augur/tasks/github/events/tasks.py deleted file mode 100644 index 442af9922f..0000000000 --- a/augur/tasks/github/events/tasks.py +++ /dev/null @@ -1,235 +0,0 @@ -import logging -import traceback -import sqlalchemy as s - -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.application.db.data_parse import * -from augur.tasks.github.util.github_paginator import GithubPaginator -from augur.tasks.github.util.github_task_session import GithubTaskManifest -from augur.tasks.github.util.util import get_owner_repo -from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.models import PullRequest, PullRequestEvent, Issue, IssueEvent, Contributor, Repo -from augur.application.db.util import execute_session_query - -platform_id = 1 - -@celery.task(base=AugurCoreRepoCollectionTask) 
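The new events module pages the issues/events endpoints newest-first, stops as soon as an event predates the incremental cutoff, and flushes to the database in batches of roughly 500. A stripped-down sketch of that pattern with plain requests; the real code goes through GithubDataAccess.paginate_resource and bulk_insert_dicts, so the names below are illustrative only:

from datetime import datetime

import requests

def iter_recent_events(owner, repo, since, token=None):
    """Yield issue events newest-first and stop once they predate `since` (an aware datetime)."""
    headers = {"Authorization": f"token {token}"} if token else {}
    url = f"https://api.github.com/repos/{owner}/{repo}/issues/events?per_page=100"
    while url:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        for event in response.json():
            yield event
            created = datetime.fromisoformat(event["created_at"].replace("Z", "+00:00"))
            if since and created < since:
                return
        url = response.links.get("next", {}).get("url")

def collect_in_batches(events, flush, batch_size=500):
    """Accumulate events and hand them to `flush` in chunks, as the bulk collector does."""
    batch = []
    for event in events:
        batch.append(event)
        if len(batch) >= batch_size:
            flush(batch)
            batch.clear()
    if batch:
        flush(batch)

Keeping the flush threshold small relative to GitHub's 300-page pagination cap (which the bulk/thorough decision above checks for) means a collection run never holds more than a few hundred events in memory at once.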
-def collect_events(repo_git: str): - - logger = logging.getLogger(collect_events.__name__) - - with GithubTaskManifest(logger) as manifest: - - augur_db = manifest.augur_db - - try: - - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id - - owner, repo = get_owner_repo(repo_git) - - logger.info(f"Collecting Github events for {owner}/{repo}") - - url = f"https://api.github.com/repos/{owner}/{repo}/issues/events" - - event_data = retrieve_all_event_data(repo_git, logger, manifest.key_auth) - - if event_data: - - process_events(event_data, f"{owner}/{repo}: Event task", repo_id, logger, manifest.augur_db) - - else: - logger.info(f"{owner}/{repo} has no events") - except Exception as e: - logger.error(f"Could not collect events for {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") - - -def retrieve_all_event_data(repo_git: str, logger, key_auth): - - owner, repo = get_owner_repo(repo_git) - - logger.info(f"Collecting Github events for {owner}/{repo}") - - url = f"https://api.github.com/repos/{owner}/{repo}/issues/events" - - # returns an iterable of all issues at this url (this essentially means you can treat the issues variable as a list of the issues) - events = GithubPaginator(url, key_auth, logger) - - - num_pages = events.get_num_pages() - all_data = [] - for page_data, page in events.iter_pages(): - - if page_data is None: - return all_data - - elif len(page_data) == 0: - logger.debug(f"{repo.capitalize()} Events Page {page} contains no data...returning") - logger.info(f"Events Page {page} of {num_pages}") - return all_data - - logger.info(f"{repo} Events Page {page} of {num_pages}") - - all_data += page_data - - return all_data - -def process_events(events, task_name, repo_id, logger, augur_db): - - tool_source = "Github events task" - tool_version = "2.0" - data_source = "Github API" - - pr_event_dicts = [] - issue_event_dicts = [] - contributors = [] - - - # create mapping from issue url to issue id of current issues - issue_url_to_id_map = {} - issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() - for issue in issues: - issue_url_to_id_map[issue.issue_url] = issue.issue_id - - # create mapping from pr url to pr id of current pull requests - pr_url_to_id_map = {} - prs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() - for pr in prs: - pr_url_to_id_map[pr.pr_url] = pr.pull_request_id - - not_mapable_event_count = 0 - event_len = len(events) - for event in events: - - event, contributor = process_github_event_contributors(logger, event, tool_source, tool_version, data_source) - - # event_mapping_data is the pr or issue data needed to relate the event to an issue or pr - event_mapping_data = event["issue"] - - if event_mapping_data is None: - not_mapable_event_count += 1 - continue - - pull_request = event_mapping_data.get('pull_request', None) - if pull_request: - pr_url = pull_request["url"] - - try: - pull_request_id = pr_url_to_id_map[pr_url] - - # query = augur_db.session.query(PullRequest).filter(PullRequest.pr_url == pr_url) - # related_pr = execute_session_query(query, 'one') - except KeyError: - logger.info(f"{task_name}: Could not find related pr") - logger.info(f"{task_name}: We were searching for: {pr_url}") - logger.info(f"{task_name}: Skipping") - continue - - pr_event_dicts.append( - extract_pr_event_data(event, pull_request_id, platform_id, repo_id, - 
tool_source, tool_version, data_source) - ) - - else: - issue_url = event_mapping_data["url"] - - try: - issue_id = issue_url_to_id_map[issue_url] - # query = augur_db.session.query(Issue).filter(Issue.issue_url == issue_url) - # related_issue = execute_session_query(query, 'one') - except KeyError: - logger.info(f"{task_name}: Could not find related pr") - logger.info(f"{task_name}: We were searching for: {issue_url}") - logger.info(f"{task_name}: Skipping") - continue - - issue_event_dicts.append( - extract_issue_event_data(event, issue_id, platform_id, repo_id, - tool_source, tool_version, data_source) - ) - - # add contributor to list after porcessing the event, - # so if it fails processing for some reason the contributor is not inserted - # NOTE: contributor is none when there is no contributor data on the event - if contributor: - contributors.append(contributor) - - # remove contributors that were found in the data more than once - contributors = remove_duplicate_dicts(contributors) - - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) - - issue_events_len = len(issue_event_dicts) - pr_events_len = len(pr_event_dicts) - if event_len != (issue_events_len + pr_events_len): - - unassigned_events = event_len - issue_events_len - pr_events_len - - logger.error(f"{task_name}: {event_len} events were processed, but {pr_events_len} pr events were found and related to a pr, and {issue_events_len} issue events were found and related to an issue. {not_mapable_event_count} events were not related to a pr or issue due to the api returning insufficient data. For some reason {unassigned_events} events were not able to be processed even when the api returned sufficient data. This is usually because pull requests or issues have not been collected, and the events are skipped because they cannot be related to a pr or issue") - - logger.info(f"{task_name}: Inserting {len(pr_event_dicts)} pr events and {len(issue_event_dicts)} issue events") - - # TODO: Could replace this with "id" but it isn't stored on the table for some reason - pr_event_natural_keys = ["node_id"] - augur_db.insert_data(pr_event_dicts, PullRequestEvent, pr_event_natural_keys) - - issue_event_natural_keys = ["issue_id", "issue_event_src_id"] - augur_db.insert_data(issue_event_dicts, IssueEvent, issue_event_natural_keys) - - update_issue_closed_cntrbs_from_events(augur_db.engine, repo_id) - -# TODO: Should we skip an event if there is no contributor to resolve it o -def process_github_event_contributors(logger, event, tool_source, tool_version, data_source): - - if event["actor"]: - - event_cntrb = extract_needed_contributor_data(event["actor"], tool_source, tool_version, data_source) - event["cntrb_id"] = event_cntrb["cntrb_id"] - - else: - event["cntrb_id"] = None - return event, None - - return event, event_cntrb - - -def update_issue_closed_cntrbs_from_events(engine, repo_id): - - get_ranked_issues = s.text(f""" - WITH RankedIssues AS ( - SELECT repo_id, issue_id, cntrb_id, - ROW_NUMBER() OVER(PARTITION BY issue_id ORDER BY created_at DESC) AS rn - FROM issue_events - WHERE "action" = 'closed' - ) - - SELECT issue_id, cntrb_id from RankedIssues where rn=1 and repo_id={repo_id} and cntrb_id is not NULL - """) - - with engine.connect() as conn: - result = conn.execute(get_ranked_issues).fetchall() - - update_data = [] - for row in result: - update_data.append( - { - 'issue_id': row[0], - 'cntrb_id': row[1], - 'repo_id': repo_id - } - ) - - if update_data: - with engine.connect() as connection: - update_stmt = s.text(""" 
- UPDATE issues - SET cntrb_id = :cntrb_id - WHERE issue_id = :issue_id - AND repo_id = :repo_id - """) - connection.execute(update_stmt, update_data) - - diff --git a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py index fab5ef6885..4303cc3193 100644 --- a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py +++ b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py @@ -3,11 +3,11 @@ import time import sqlalchemy as s from augur.application.db.models import * -from augur.tasks.github.util.github_paginator import hit_api, process_dict_response, retrieve_dict_from_endpoint +from augur.tasks.github.util.github_paginator import hit_api, process_dict_response +from augur.tasks.github.util.github_data_access import GithubDataAccess # Debugger -import traceback from augur.tasks.github.util.github_paginator import GithubApiResult -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_repo_by_repo_id, bulk_insert_dicts, execute_sql, get_contributors_by_github_user_id ##TODO: maybe have a TaskSession class that holds information about the database, logger, config, etc. @@ -23,19 +23,17 @@ # Hit the endpoint specified by the url and return the json that it returns if it returns a dict. # Returns None on failure. -# NOTE: This function is being deprecated in favor of retrieve_dict_from_endpoint -def request_dict_from_endpoint(session, url, timeout_wait=10): - #session.logger.info(f"Hitting endpoint: {url}") - +# NOTE: This function is being deprecated in favor of GithubDataAcess.get_resource() +def request_dict_from_endpoint(logger, session, url, timeout_wait=10): attempts = 0 response_data = None success = False while attempts < 10: try: - response = hit_api(session.oauths, url, session.logger) + response = hit_api(session.oauths, url, logger) except TimeoutError: - session.logger.info( + logger.warning( f"User data request for enriching contributor data failed with {attempts} attempts! Trying again...") time.sleep(timeout_wait) continue @@ -50,34 +48,33 @@ def request_dict_from_endpoint(session, url, timeout_wait=10): response_data = json.loads(json.dumps(response.text)) if type(response_data) == dict: - err = process_dict_response(session.logger,response,response_data) + err = process_dict_response(logger,response,response_data) #If we get an error message that's not None if err and err != GithubApiResult.SUCCESS: attempts += 1 - session.logger.info(f"err: {err}") + logger.warning(f"err: {err}") continue - #session.logger.info(f"Returned dict: {response_data}") success = True break elif type(response_data) == list: - session.logger.warning("Wrong type returned, trying again...") - session.logger.info(f"Returned list: {response_data}") + logger.warning("Wrong type returned, trying again...") + logger.debug(f"Returned list: {response_data}") elif type(response_data) == str: - session.logger.info( + logger.warning( f"Warning! 
page_data was string: {response_data}") if "" in response_data: - session.logger.info("HTML was returned, trying again...\n") + logger.warning("HTML was returned, trying again...\n") elif len(response_data) == 0: - session.logger.warning("Empty string, trying again...\n") + logger.warning("Empty string, trying again...\n") else: try: # Sometimes raw text can be converted to a dict response_data = json.loads(response_data) - err = process_dict_response(session.logger,response,response_data) + err = process_dict_response(logger,response,response_data) #If we get an error message that's not None if err and err != GithubApiResult.SUCCESS: @@ -95,7 +92,6 @@ def request_dict_from_endpoint(session, url, timeout_wait=10): def create_endpoint_from_email(email): - #self.logger.info(f"Trying to resolve contributor from email: {email}") # Note: I added "+type:user" to avoid having user owned organizations be returned # Also stopped splitting per note above. url = 'https://api.github.com/search/users?q={}+in:email+type:user'.format( @@ -105,39 +101,33 @@ def create_endpoint_from_email(email): return url -def create_endpoint_from_commit_sha(logger,db,commit_sha, repo_id): - logger.info( +def create_endpoint_from_commit_sha(logger, commit_sha, repo_id): + logger.debug( f"Trying to create endpoint from commit hash: {commit_sha}") # https://api.github.com/repos/chaoss/augur/commits/53b0cc122ac9ecc1588d76759dc2e8e437f45b48 #stmnt = s.select(Repo.repo_path, Repo.repo_name).where(Repo.repo_id == repo_id) - - query = db.query(Repo).filter_by(repo_id=repo_id) - result = execute_session_query(query, 'one') + result = get_repo_by_repo_id(repo_id) if result.repo_path is None or result.repo_name is None: raise KeyError # Else put into a more readable local var - #session.logger.info(f"Result: {result}") split_git = result.repo_git.split('/') repo_name_and_org = split_git[-2] + "/" + result.repo_name url = "https://api.github.com/repos/" + repo_name_and_org + "/commits/" + commit_sha - logger.info(f"Url: {url}") + logger.debug(f"Commit Hash URL: {url}") return url # Try to construct the best url to ping GitHub's API for a username given a full name. def create_endpoint_from_name(contributor): - #self.logger.info( - # f"Trying to resolve contributor from name: {contributor}") - # Try to get the 'names' field if 'commit_name' field is not present in contributor data. name_field = 'cmt_author_name' if 'commit_name' in contributor else 'name' @@ -154,30 +144,28 @@ def create_endpoint_from_name(contributor): return url -def insert_alias(logger,db, contributor, email): +def insert_alias(logger, contributor, email): # Insert cntrb_id and email of the corresponding record into the alias table # Another database call to get the contributor id is needed because its an autokeyincrement that is accessed by multiple workers # Same principle as enrich_cntrb_id method. - query = db.query(Contributor).filter_by(gh_user_id=contributor["gh_user_id"]) - contributor_table_data = execute_session_query(query, 'all') - # self.logger.info(f"Contributor query: {contributor_table_data}") + contributor_table_data = get_contributors_by_github_user_id(contributor["gh_user_id"]) # Handle potential failures if len(contributor_table_data) == 1: - logger.info( + logger.debug( f"cntrb_id {contributor_table_data[0].cntrb_id} found in database and assigned to enriched data") elif len(contributor_table_data) == 0: logger.error("Couldn't find contributor in database. Something has gone very wrong. 
Augur ran into a contributor whose login can be found in the contributor's table, but cannot be retrieved via the user_id that was gotten using the same login.") raise LookupError else: - logger.info( + logger.warning( f"There are more than one contributors in the table with gh_user_id={contributor['gh_user_id']}") - #session.logger.info(f"Creating alias for email: {email}") + logger.debug(f"Creating alias for email: {email}") - #session.logger.info(f"{contributor_table_data} has type {type(contributor_table_data)}") + #logger.info(f"{contributor_table_data} has type {type(contributor_table_data)}") # Insert a new alias that corresponds to where the contributor was found # use the email of the new alias for canonical_email if the api returns NULL # TODO: It might be better to have the canonical_email allowed to be NUll because right now it has a null constraint. @@ -192,7 +180,7 @@ def insert_alias(logger,db, contributor, email): # Insert new alias - db.insert_data(alias, ContributorsAlias, ['alias_email']) + bulk_insert_dicts(logger, alias, ContributorsAlias, ['alias_email']) return @@ -200,7 +188,7 @@ def insert_alias(logger,db, contributor, email): # Takes the user data from the endpoint as arg # Updates the alias table if the login is already in the contributor's table with the new email. # Returns whether the login was found in the contributors table -def resolve_if_login_existing(session, contributor): +def resolve_if_login_existing(logger, contributor): # check if login exists in contributors table select_cntrbs_query = s.sql.text(""" SELECT cntrb_id from contributors @@ -210,7 +198,7 @@ def resolve_if_login_existing(session, contributor): # Bind parameter select_cntrbs_query = select_cntrbs_query.bindparams( gh_login_value=contributor['cntrb_login']) - result = session.execute_sql(select_cntrbs_query) + result = execute_sql(select_cntrbs_query) # if yes if len(result.fetchall()) >= 1: @@ -218,8 +206,6 @@ def resolve_if_login_existing(session, contributor): return True # If not found, return false - session.logger.info( - f"Contributor not found in contributors table but can be added. Adding...") return False """ No longer used after orm upsert implement @@ -276,12 +262,11 @@ def fetch_username_from_email(logger, auth, commit): # Default to failed state login_json = None - #session.logger.info(f"Here is the commit: {commit}") + #logger.info(f"Here is the commit: {commit}") # email = commit['email_raw'] if 'email_raw' in commit else commit['email_raw'] if len(commit['email_raw']) <= 2: - logger.info("Email less than two characters") return login_json # Don't bother with emails that are blank or less than 2 characters try: @@ -291,27 +276,27 @@ def fetch_username_from_email(logger, auth, commit): f"Couldn't resolve email url with given data. Reason: {e}") # If the method throws an error it means that we can't hit the endpoint so we can't really do much return login_json - - login_json, _ = retrieve_dict_from_endpoint(logger, auth, url) + github_data_access = GithubDataAccess(auth, logger) + + login_json = github_data_access.get_resource(url) + # Check if the email result got anything, if it failed try a name search. if login_json is None or 'total_count' not in login_json or login_json['total_count'] == 0: - logger.info( + logger.warning( f"Could not resolve the username from {commit['email_raw']}") - logger.info(f"email api url {url}") + logger.debug(f"email api url {url}") return None - else: - # Return endpoint dictionary if email found it. 
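For reference, a minimal sketch of the lookup pattern this hunk moves to, assuming GithubDataAccess.get_resource returns the parsed JSON dict directly (the retired retrieve_dict_from_endpoint helper returned a tuple instead); the wrapper name below is illustrative and not part of the patch:

    from augur.tasks.github.util.github_data_access import GithubDataAccess

    def resolve_login_from_email_search(url, key_auth, logger):
        # One data-access object per task, reused for every request it makes.
        github_data_access = GithubDataAccess(key_auth, logger)
        login_json = github_data_access.get_resource(url)
        # GitHub's user-search payload carries 'total_count'; zero means no match.
        if login_json is None or login_json.get('total_count', 0) == 0:
            return None
        return login_json
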
- return login_json - # failure condition returns None return login_json # Method to return the login given commit data using the supplemental data in the commit # -email # -name -def get_login_with_supplemental_data(logger,db,auth, commit_data): +def get_login_with_supplemental_data(logger, auth, commit_data): + + github_data_access = GithubDataAccess(auth, logger) # Try to get login from all possible emails # Is None upon failure. @@ -324,40 +309,45 @@ def get_login_with_supplemental_data(logger,db,auth, commit_data): "email": commit_data['email_raw'], "name": commit_data['name'], } - logger.info(f"Inserting data to unresolved: {unresolved}") + logger.debug(f"Inserting data to unresolved: {unresolved}") try: unresolved_natural_keys = ['email'] - db.insert_data(unresolved, UnresolvedCommitEmail, unresolved_natural_keys) + bulk_insert_dicts(logger, unresolved, UnresolvedCommitEmail, unresolved_natural_keys) except Exception as e: logger.error( f"Could not create new unresolved email {unresolved['email']}. Error: {e}") - logger.info( + logger.warning( "Could not resolve the username from the email. Trying a name only search...") try: url = create_endpoint_from_name(commit_data) except Exception as e: - logger.info( + logger.warning( f"Couldn't resolve name url with given data. Reason: {e}") return None - - login_json, _ = retrieve_dict_from_endpoint(logger, auth, url) + + login_json = github_data_access.get_resource(url) # total_count is the count of username's found by the endpoint. if login_json is None or 'total_count' not in login_json: - logger.info( + logger.error( "Search query returned an empty response, moving on...\n") return None if login_json['total_count'] == 0: - logger.info( + logger.error( "Search query did not return any results, adding commit's table remains null...\n") return None # Grab first result and make sure it has the highest match score - match = login_json['items'][0] + try: + match = login_json['items'][0] + except IndexError as e: + logger.error(f"Ran into error {e} when parsing users with search url: {url}\n return dict: {login_json}") + return None + for item in login_json['items']: if item['score'] > match['score']: match = item @@ -367,18 +357,20 @@ def get_login_with_supplemental_data(logger,db,auth, commit_data): return match['login'] -def get_login_with_commit_hash(logger,db,auth, commit_data, repo_id): +def get_login_with_commit_hash(logger, auth, commit_data, repo_id): # Get endpoint for login from hash url = create_endpoint_from_commit_sha( - logger,db,commit_data['hash'], repo_id) + logger, commit_data['hash'], repo_id) #TODO: here. # Send api request - login_json, _ = retrieve_dict_from_endpoint(logger, auth, url)#request_dict_from_endpoint(session,url) + github_data_access = GithubDataAccess(auth, logger) + login_json = github_data_access.get_resource(url) + # TODO: Why are we returning None if 'sha' is not in response if we aren't even using it? if login_json is None or 'sha' not in login_json: - logger.info(f"Search query returned empty data. Moving on. Data: {login_json}") + logger.debug(f"Search query returned empty data. Moving on. 
Data: {login_json}") return None try: @@ -387,23 +379,3 @@ def get_login_with_commit_hash(logger,db,auth, commit_data, repo_id): match = None return match - - - -def create_endpoint_from_repo_id(logger,db, repo_id): - - """ - SELECT repo_git from repo - WHERE repo_id = :repo_id_bind - """ - #ORM syntax of above statement - query = db.session.query(Repo).filter_by(repo_id=repo_id) - result = execute_session_query(query, 'one') - - url = result.repo_git - logger.info(f"Url: {url}") - - return url - - - diff --git a/augur/tasks/github/facade_github/core.py b/augur/tasks/github/facade_github/core.py index 10f4affc6a..55b2281ad8 100644 --- a/augur/tasks/github/facade_github/core.py +++ b/augur/tasks/github/facade_github/core.py @@ -1,23 +1,28 @@ from augur.tasks.github.facade_github.contributor_interfaceable.contributor_interface import * from augur.tasks.github.util.util import get_owner_repo from augur.tasks.github.util.github_task_session import * -from augur.tasks.github.util.github_paginator import * from augur.application.db.models import * from augur.tasks.util.AugurUUID import GithubUUID +from augur.application.db.lib import bulk_insert_dicts, batch_insert_contributors +from augur.tasks.github.util.github_data_access import GithubDataAccess -def query_github_contributors(manifest, github_url): + +def query_github_contributors(logger, key_auth, github_url): """ Data collection function Query the GitHub API for contributors """ + # Set platform id to 1 since it is a github method + platform_id = 1 + # Extract owner/repo from the url for the endpoint try: owner, name = get_owner_repo(github_url) except IndexError as e: - manifest.logger.error(f"Encountered bad url: {github_url}") + logger.error(f"Encountered bad url: {github_url}") raise e # Set the base of the url and place to hold contributors to insert @@ -34,17 +39,16 @@ def query_github_contributors(manifest, github_url): update_col_map = {'cntrb_email': 'email'} duplicate_col_map = {'cntrb_login': 'login'} - #list to hold contributors needing insertion or update - contributor_list = GithubPaginator(contributors_url, manifest.key_auth,manifest.logger)#paginate(contributors_url, duplicate_col_map, update_col_map, table, table_pkey) + github_data_access = GithubDataAccess(key_auth, logger) - len_contributor_list = len(contributor_list) + contributor_count = github_data_access.get_resource_count(contributors_url) - manifest.logger.info("Count of contributors needing insertion: " + str(len_contributor_list) + "\n") + logger.info("Count of contributors needing insertion: " + str(contributor_count) + "\n") - if len_contributor_list == 0: + if contributor_count == 0: return - for repo_contributor in contributor_list: + for repo_contributor in github_data_access.paginate_resource(contributors_url): try: # Need to hit this single contributor endpoint to get extra data including... 
# `created at` @@ -52,13 +56,13 @@ def query_github_contributors(manifest, github_url): cntrb_url = ("https://api.github.com/users/" + repo_contributor['login']) - manifest.logger.info("Hitting endpoint: " + cntrb_url + " ...\n") - #r = hit_api(session.oauths, cntrb_url, session.logger) + logger.info("Hitting endpoint: " + cntrb_url + " ...\n") + #r = hit_api(session.oauths, cntrb_url, logger) #contributor = r.json() - contributor, result = retrieve_dict_from_endpoint(manifest.logger,manifest.key_auth, cntrb_url) + contributor = github_data_access.get_resource(cntrb_url) - #manifest.logger.info(f"Contributor: {contributor} \n") + #logger.info(f"Contributor: {contributor} \n") company = None location = None email = None @@ -76,7 +80,7 @@ def query_github_contributors(manifest, github_url): #cntrb_id = AugurUUID(session.platform_id,contributor['id']).to_UUID() cntrb_id = GithubUUID() cntrb_id["user"] = int(contributor['id']) - cntrb_id["platform"] = manifest.platform_id + cntrb_id["platform"] = platform_id cntrb = { "cntrb_id" : cntrb_id.to_UUID(), @@ -112,23 +116,19 @@ def query_github_contributors(manifest, github_url): #"data_source": session.data_source } - cntrb_natural_keys = ['cntrb_id'] #insert cntrb to table. #session.logger.info(f"Contributor: {cntrb} \n") - manifest.augur_db.insert_data(cntrb,Contributor,cntrb_natural_keys) + batch_insert_contributors(logger, [cntrb]) except Exception as e: - manifest.logger.error("Caught exception: {}".format(e)) - manifest.logger.error("Cascading Contributor Anomalie from missing repo contributor data: {} ...\n".format(cntrb_url)) + logger.error("Caught exception: {}".format(e)) + logger.error("Cascading Contributor Anomalie from missing repo contributor data: {} ...\n".format(cntrb_url)) raise e # Get all the committer data for a repo. 
# Used by facade in facade03analyzecommit -def grab_committer_list(manifest, repo_id, platform="github"): +def grab_committer_list(logger, key_auth, repo_git, platform="github"): # Create API endpoint from repo_id - - endpoint = create_endpoint_from_repo_id(manifest.logger,manifest.augur_db, repo_id) - - query_github_contributors(manifest,endpoint) + query_github_contributors(logger, key_auth, repo_git) \ No newline at end of file diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index 64ce4f7409..1b11f98223 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -3,15 +3,17 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask -from augur.tasks.github.util.github_paginator import retrieve_dict_from_endpoint -from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.application.db.models import Contributor from augur.tasks.github.facade_github.core import * -from augur.application.db.util import execute_session_query +from augur.application.db.lib import execute_sql, get_contributor_aliases_by_email, get_unresolved_commit_emails_by_name, get_contributors_by_full_name, get_repo_by_repo_git, batch_insert_contributors from augur.tasks.git.util.facade_worker.facade_worker.facade00mainprogram import * -def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id): +def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id): + + github_data_access = GithubDataAccess(auth, logger) for contributor in contributorQueue: # Get the email from the commit data @@ -22,11 +24,10 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) # check the email to see if it already exists in contributor_aliases # Look up email to see if resolved - query = db.query(ContributorsAlias).filter_by(alias_email=email) - alias_table_data = execute_session_query(query, 'all') + alias_table_data = get_contributor_aliases_by_email(email) if len(alias_table_data) >= 1: # Move on if email resolved - logger.info( + logger.debug( f"Email {email} has been resolved earlier.") continue @@ -34,35 +35,33 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) #Check the unresolved_commits table to avoid hitting endpoints that we know don't have relevant data needlessly - query = db.query(UnresolvedCommitEmail).filter_by(name=name) - unresolved_query_result = execute_session_query(query, 'all') + unresolved_query_result = get_unresolved_commit_emails_by_name(name) if len(unresolved_query_result) >= 1: - logger.info(f"Commit data with email {email} has been unresolved in the past, skipping...") + logger.debug(f"Commit data with email {email} has been unresolved in the past, skipping...") continue login = None #Check the contributors table for a login for the given name - query = db.query(Contributor).filter_by(cntrb_full_name=name) - contributors_with_matching_name = execute_session_query(query, 'first') + contributors_with_matching_name = get_contributors_by_full_name(name) - if not contributors_with_matching_name: + if not contributors_with_matching_name or len(contributors_with_matching_name) > 1: logger.debug("Failed local login lookup") else: - login = 
contributors_with_matching_name.gh_login + login = contributors_with_matching_name[0].gh_login # Try to get the login from the commit sha if login == None or login == "": - login = get_login_with_commit_hash(logger,db,auth,contributor, repo_id) + login = get_login_with_commit_hash(logger, auth, contributor, repo_id) if login == None or login == "": - logger.info("Failed to get login from commit hash") + logger.warning("Failed to get login from commit hash") # Try to get the login from supplemental data if not found with the commit hash - login = get_login_with_supplemental_data(logger, db, auth,contributor) + login = get_login_with_supplemental_data(logger, auth,contributor) if login == None or login == "": logger.error("Failed to get login from supplemental data!") @@ -70,11 +69,10 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) url = ("https://api.github.com/users/" + login) - user_data, _ = retrieve_dict_from_endpoint(logger, auth, url) - - if user_data == None: - logger.warning( - f"user_data was unable to be reached. Skipping...") + try: + user_data = github_data_access.get_resource(url) + except UrlNotFoundException as e: + logger.warning(f"User of {login} not found on github. Skipping...") continue # Use the email found in the commit data if api data is NULL @@ -129,17 +127,15 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) #Executes an upsert with sqlalchemy cntrb_natural_keys = ['cntrb_id'] - - db.insert_data(cntrb,Contributor,cntrb_natural_keys) - + batch_insert_contributors(logger, [cntrb]) try: # Update alias after insertion. Insertion needs to happen first so we can get the autoincrementkey - insert_alias(logger, db,cntrb, emailFromCommitData) + insert_alias(logger, cntrb, emailFromCommitData) except LookupError as e: - logger.info( + logger.error( ''.join(traceback.format_exception(None, e, e.__traceback__))) - logger.info( + logger.error( f"Contributor id not able to be found in database despite the user_id existing. Something very wrong is happening. Error: {e}") return @@ -156,12 +152,12 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) WHERE email='{}' """.format(escapedEmail)) - logger.info(f"Updating now resolved email {email}") + logger.debug(f"Updating now resolved email {email}") try: - db.execute_sql(query) + execute_sql(query) except Exception as e: - logger.info( + logger.error( f"Deleting now resolved email failed with error: {e}") raise e @@ -169,11 +165,11 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) return -def link_commits_to_contributor(session,contributorQueue): +def link_commits_to_contributor(logger, facade_helper, contributorQueue): # # iterate through all the commits with emails that appear in contributors and give them the relevant cntrb_id. for cntrb in contributorQueue: - session.logger.debug( + logger.debug( f"These are the emails and cntrb_id's returned: {cntrb}") query = s.sql.text(""" @@ -186,7 +182,7 @@ def link_commits_to_contributor(session,contributorQueue): """).bindparams(cntrb_id=cntrb["cntrb_id"],cntrb_email=cntrb["email"]) #engine.execute(query, **data) - session.insert_or_update_data(query) + facade_helper.insert_or_update_data(query) return @@ -194,115 +190,110 @@ def link_commits_to_contributor(session,contributorQueue): # Update the contributors table from the data facade has gathered. 
@celery.task(base=AugurFacadeRepoCollectionTask, bind=True) -def insert_facade_contributors(self, repo_id): +def insert_facade_contributors(self, repo_git): - engine = self.app.engine + # Set platform id to 1 since this task is github specific + platform_id = 1 logger = logging.getLogger(insert_facade_contributors.__name__) + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id - with GithubTaskManifest(logger) as manifest: - - - # Get all of the commit data's emails and names from the commit table that do not appear - # in the contributors table or the contributors_aliases table. - - manifest.logger.info( - "Beginning process to insert contributors from facade commits for repo w entry info: {}\n".format(repo_id)) - new_contrib_sql = s.sql.text(""" - SELECT DISTINCT - commits.cmt_author_name AS NAME, - commits.cmt_commit_hash AS hash, - commits.cmt_author_raw_email AS email_raw, - 'not_unresolved' as resolution_status - FROM - commits - WHERE - commits.repo_id = :repo_id - AND (NOT EXISTS ( SELECT contributors.cntrb_canonical FROM contributors WHERE contributors.cntrb_canonical = commits.cmt_author_raw_email ) - or NOT EXISTS ( SELECT contributors_aliases.alias_email from contributors_aliases where contributors_aliases.alias_email = commits.cmt_author_raw_email) - AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name )) - GROUP BY - commits.cmt_author_name, - commits.cmt_commit_hash, - commits.cmt_author_raw_email - UNION - SELECT DISTINCT - commits.cmt_author_name AS NAME,--commits.cmt_id AS id, - commits.cmt_commit_hash AS hash, - commits.cmt_author_raw_email AS email_raw, - 'unresolved' as resolution_status - FROM - commits - WHERE - commits.repo_id = :repo_id - AND EXISTS ( SELECT unresolved_commit_emails.email FROM unresolved_commit_emails WHERE unresolved_commit_emails.email = commits.cmt_author_raw_email ) - AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name ) - GROUP BY - commits.cmt_author_name, - commits.cmt_commit_hash, - commits.cmt_author_raw_email - ORDER BY - hash - """).bindparams(repo_id=repo_id) - - #Execute statement with session. - result = manifest.augur_db.execute_sql(new_contrib_sql) - new_contribs = [dict(row) for row in result.mappings()] - - #print(new_contribs) - - #json.loads(pd.read_sql(new_contrib_sql, self.db, params={ - # 'repo_id': repo_id}).to_json(orient="records")) - - - - process_commit_metadata(manifest.logger,manifest.augur_db,manifest.key_auth,list(new_contribs),repo_id,manifest.platform_id) - - manifest.logger.debug("DEBUG: Got through the new_contribs") - + # Get all of the commit data's emails and names from the commit table that do not appear + # in the contributors table or the contributors_aliases table. - with FacadeSession(logger) as session: - # sql query used to find corresponding cntrb_id's of emails found in the contributor's table - # i.e., if a contributor already exists, we use it! 
- resolve_email_to_cntrb_id_sql = s.sql.text(""" + logger.info( + "Beginning process to insert contributors from facade commits for repo w entry info: {}\n".format(repo_id)) + new_contrib_sql = s.sql.text(""" SELECT DISTINCT - cntrb_id, - contributors.cntrb_login AS login, - contributors.cntrb_canonical AS email, - commits.cmt_author_raw_email + commits.cmt_author_name AS NAME, + commits.cmt_commit_hash AS hash, + commits.cmt_author_raw_email AS email_raw, + 'not_unresolved' as resolution_status FROM - contributors, commits WHERE - contributors.cntrb_canonical = commits.cmt_author_raw_email - AND commits.repo_id = :repo_id + commits.repo_id = :repo_id + AND (NOT EXISTS ( SELECT contributors.cntrb_canonical FROM contributors WHERE contributors.cntrb_canonical = commits.cmt_author_raw_email ) + or NOT EXISTS ( SELECT contributors_aliases.alias_email from contributors_aliases where contributors_aliases.alias_email = commits.cmt_author_raw_email) + AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name )) + GROUP BY + commits.cmt_author_name, + commits.cmt_commit_hash, + commits.cmt_author_raw_email UNION SELECT DISTINCT - contributors_aliases.cntrb_id, - contributors.cntrb_login as login, - contributors_aliases.alias_email AS email, - commits.cmt_author_raw_email + commits.cmt_author_name AS NAME,--commits.cmt_id AS id, + commits.cmt_commit_hash AS hash, + commits.cmt_author_raw_email AS email_raw, + 'unresolved' as resolution_status FROM - contributors, - contributors_aliases, commits WHERE - contributors_aliases.alias_email = commits.cmt_author_raw_email - AND contributors.cntrb_id = contributors_aliases.cntrb_id - AND commits.repo_id = :repo_id - """).bindparams(repo_id=repo_id) + commits.repo_id = :repo_id + AND EXISTS ( SELECT unresolved_commit_emails.email FROM unresolved_commit_emails WHERE unresolved_commit_emails.email = commits.cmt_author_raw_email ) + AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name ) + GROUP BY + commits.cmt_author_name, + commits.cmt_commit_hash, + commits.cmt_author_raw_email + ORDER BY + hash + """).bindparams(repo_id=repo_id) + + #Execute statement with session. + result = execute_sql(new_contrib_sql) + new_contribs = [dict(row) for row in result.mappings()] - #self.logger.info("DEBUG: got passed the sql statement declaration") - # Get a list of dicts that contain the emails and cntrb_id's of commits that appear in the contributor's table. - #existing_cntrb_emails = json.loads(pd.read_sql(resolve_email_to_cntrb_id_sql, self.db, params={ - # 'repo_id': repo_id}).to_json(orient="records")) + #print(new_contribs) - result = session.execute_sql(resolve_email_to_cntrb_id_sql) - existing_cntrb_emails = [dict(row) for row in result.mappings()] + #json.loads(pd.read_sql(new_contrib_sql, self.db, params={ + # 'repo_id': repo_id}).to_json(orient="records")) - print(existing_cntrb_emails) - link_commits_to_contributor(session,list(existing_cntrb_emails)) - session.logger.info("Done with inserting and updating facade contributors") + key_auth = GithubRandomKeyAuth(logger) + + process_commit_metadata(logger, key_auth, list(new_contribs), repo_id, platform_id) + + logger.debug("DEBUG: Got through the new_contribs") + + facade_helper = FacadeHelper(logger) + # sql query used to find corresponding cntrb_id's of emails found in the contributor's table + # i.e., if a contributor already exists, we use it! 
+ resolve_email_to_cntrb_id_sql = s.sql.text(""" + SELECT DISTINCT + cntrb_id, + contributors.cntrb_login AS login, + contributors.cntrb_canonical AS email, + commits.cmt_author_raw_email + FROM + contributors, + commits + WHERE + contributors.cntrb_canonical = commits.cmt_author_raw_email + AND commits.repo_id = :repo_id + UNION + SELECT DISTINCT + contributors_aliases.cntrb_id, + contributors.cntrb_login as login, + contributors_aliases.alias_email AS email, + commits.cmt_author_raw_email + FROM + contributors, + contributors_aliases, + commits + WHERE + contributors_aliases.alias_email = commits.cmt_author_raw_email + AND contributors.cntrb_id = contributors_aliases.cntrb_id + AND commits.repo_id = :repo_id + """).bindparams(repo_id=repo_id) + + + result = execute_sql(resolve_email_to_cntrb_id_sql) + existing_cntrb_emails = [dict(row) for row in result.mappings()] + + print(existing_cntrb_emails) + link_commits_to_contributor(logger, facade_helper,list(existing_cntrb_emails)) + return diff --git a/augur/tasks/github/issues/tasks.py b/augur/tasks/github/issues.py similarity index 64% rename from augur/tasks/github/issues/tasks.py rename to augur/tasks/github/issues.py index baccfdc60e..37bee5c8dd 100644 --- a/augur/tasks/github/issues/tasks.py +++ b/augur/tasks/github/issues.py @@ -1,5 +1,6 @@ import logging import traceback +from datetime import timedelta, timezone from sqlalchemy.exc import IntegrityError @@ -7,64 +8,55 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.data_parse import * -from augur.tasks.github.util.github_paginator import GithubPaginator -from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.tasks.github.util.github_data_access import GithubDataAccess +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.models import Issue, IssueLabel, IssueAssignee, Contributor, Repo +from augur.application.db.models import Issue, IssueLabel, IssueAssignee, Contributor from augur.application.config import get_development_flag -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_core_data_last_collected, batch_insert_contributors + development = get_development_flag() @celery.task(base=AugurCoreRepoCollectionTask) -def collect_issues(repo_git : str) -> int: - +def collect_issues(repo_git : str, full_collection: bool) -> int: logger = logging.getLogger(collect_issues.__name__) - with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id - logger.info(f'this is the manifest.key_auth value: {str(manifest.key_auth)}') + owner, repo = get_owner_repo(repo_git) - try: - - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id - - #try this - # the_key = manifest.key_auth - # try: - # randomon = GithubApiKeyHandler(augur_db.session) - # the_key = randomon.get_random_key() - # logger.info(f'The Random Key {the_key}') - # except Exception as e: - # logger.info(f'error: {e}') - # the_key = manifest.key_auth - # pass - - owner, repo = get_owner_repo(repo_git) - - issue_data = retrieve_all_issue_data(repo_git, 
logger, manifest.key_auth) - #issue_data = retrieve_all_issue_data(repo_git, logger, the_key) - - if issue_data: - total_issues = len(issue_data) - process_issues(issue_data, f"{owner}/{repo}: Issue task", repo_id, logger, augur_db) - - return total_issues - else: - logger.info(f"{owner}/{repo} has no issues") - return 0 - except Exception as e: - logger.error(f"Could not collect issues for repo {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") - return -1 - - - -def retrieve_all_issue_data(repo_git, logger, key_auth) -> None: + if full_collection: + core_data_last_collected = None + else: + # subtract 2 days to ensure all data is collected + core_data_last_collected = (get_core_data_last_collected(repo_id) - timedelta(days=2)).replace(tzinfo=timezone.utc) + + key_auth = GithubRandomKeyAuth(logger) + + logger.info(f'this is the manifest.key_auth value: {str(key_auth)}') + + try: + issue_data = retrieve_all_issue_data(repo_git, logger, key_auth, core_data_last_collected) + + if not issue_data: + logger.info(f"{owner}/{repo} has no issues") + return 0 + + total_issues = len(issue_data) + process_issues(issue_data, f"{owner}/{repo}: Issue task", repo_id, logger) + + return total_issues + + except Exception as e: + logger.error(f"Could not collect issues for repo {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") + return -1 + + + +def retrieve_all_issue_data(repo_git, logger, key_auth, since) -> None: owner, repo = get_owner_repo(repo_git) @@ -72,34 +64,19 @@ def retrieve_all_issue_data(repo_git, logger, key_auth) -> None: url = f"https://api.github.com/repos/{owner}/{repo}/issues?state=all" - # returns an iterable of all issues at this url (this essentially means you can treat the issues variable as a list of the issues) - # Reference the code documenation for GithubPaginator for more details - issues = GithubPaginator(url, key_auth, logger) - - # this is defined so we can decrement it each time - # we come across a pr, so at the end we can log how - # many issues were collected - # loop through the issues - all_data = [] - num_pages = issues.get_num_pages() - for page_data, page in issues.iter_pages(): - - if page_data is None: - return all_data + if since: + url += f"&since={since.isoformat()}" - if len(page_data) == 0: - logger.debug( - f"{owner}/{repo}: Issues Page {page} contains no data...returning") - logger.info(f"{owner}/{repo}: Issues Page {page} of {num_pages}") - return all_data + github_data_access = GithubDataAccess(key_auth, logger) - logger.info(f"{owner}/{repo}: Issues Page {page} of {num_pages}") + num_pages = github_data_access.get_resource_page_count(url) + logger.info(f"{owner}/{repo}: Retrieving {num_pages} pages of issues") - all_data += page_data + issues_paginator = github_data_access.paginate_resource(url) - return all_data + return list(issues_paginator) -def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: +def process_issues(issues, task_name, repo_id, logger) -> None: # get repo_id or have it passed tool_source = "Issue Task" @@ -153,7 +130,7 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: # insert contributors from these issues logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + batch_insert_contributors(logger, contributors) # insert the issues into the issues table. 
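For reference, a condensed sketch of the incremental-collection pattern the issue task now follows, assuming get_core_data_last_collected and GithubDataAccess behave as shown in the hunks above (the helper name is illustrative only):

    from datetime import timedelta, timezone
    from augur.tasks.github.util.github_data_access import GithubDataAccess
    from augur.application.db.lib import get_core_data_last_collected

    def build_issues_url(owner, repo, repo_id, full_collection):
        url = f"https://api.github.com/repos/{owner}/{repo}/issues?state=all"
        if not full_collection:
            # Back up two days from the last core collection so records updated
            # near the boundary are still picked up on recollection.
            since = (get_core_data_last_collected(repo_id) - timedelta(days=2)).replace(tzinfo=timezone.utc)
            url += f"&since={since.isoformat()}"
        return url

    # issues = list(GithubDataAccess(key_auth, logger).paginate_resource(build_issues_url(...)))
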
@@ -164,7 +141,7 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: issue_return_columns = ["issue_url", "issue_id"] issue_string_columns = ["issue_title", "issue_body"] try: - issue_return_data = augur_db.insert_data(issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) + issue_return_data = bulk_insert_dicts(logger, issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) except IntegrityError as e: logger.error(f"Ran into integrity error:{e} \n Offending data: \n{issue_dicts}") @@ -197,13 +174,13 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: # we are using label_src_id and issue_id to determine if the label is already in the database. issue_label_natural_keys = ['label_src_id', 'issue_id'] issue_label_string_fields = ["label_text", "label_description"] - augur_db.insert_data(issue_label_dicts, IssueLabel, + bulk_insert_dicts(logger, issue_label_dicts, IssueLabel, issue_label_natural_keys, string_fields=issue_label_string_fields) # inserting issue assignees # we are using issue_assignee_src_id and issue_id to determine if the label is already in the database. issue_assignee_natural_keys = ['issue_assignee_src_id', 'issue_id'] - augur_db.insert_data(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) + bulk_insert_dicts(logger, issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) diff --git a/augur/tasks/github/issues/__init__.py b/augur/tasks/github/issues/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/augur/tasks/github/issues/core.py b/augur/tasks/github/issues/core.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/augur/tasks/github/messages/tasks.py b/augur/tasks/github/messages.py similarity index 65% rename from augur/tasks/github/messages/tasks.py rename to augur/tasks/github/messages.py index fc1776b2ed..7f1e63ea8c 100644 --- a/augur/tasks/github/messages/tasks.py +++ b/augur/tasks/github/messages.py @@ -1,22 +1,22 @@ import logging - +from datetime import timedelta, timezone from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.data_parse import * -from augur.tasks.github.util.github_paginator import GithubPaginator +from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models import PullRequest, Message, Issue, PullRequestMessageRef, IssueMessageRef, Contributor, Repo - - +from augur.application.db.models import PullRequest, Message, Issue, PullRequestMessageRef, IssueMessageRef, Contributor, Repo, CollectionStatus +from augur.application.db import get_engine, get_session +from augur.application.db.lib import get_core_data_last_collected +from sqlalchemy.sql import text platform_id = 1 - @celery.task(base=AugurCoreRepoCollectionTask) -def collect_github_messages(repo_git: str) -> None: +def collect_github_messages(repo_git: str, full_collection: bool) -> None: logger = logging.getLogger(collect_github_messages.__name__) @@ -29,55 +29,108 @@ def collect_github_messages(repo_git: str) -> None: owner, repo = get_owner_repo(repo_git) task_name = f"{owner}/{repo}: Message Task" - 
message_data = retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name) - - if message_data: + + if full_collection: + core_data_last_collected = None + else: + # subtract 2 days to ensure all data is collected + core_data_last_collected = (get_core_data_last_collected(repo_id) - timedelta(days=2)).replace(tzinfo=timezone.utc) + - process_messages(message_data, task_name, repo_id, logger, augur_db) + if is_repo_small(repo_id): + message_data = fast_retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name, core_data_last_collected) + + if message_data: + process_messages(message_data, task_name, repo_id, logger, augur_db) + + else: + logger.info(f"{owner}/{repo} has no messages") else: - logger.info(f"{owner}/{repo} has no messages") + process_large_issue_and_pr_message_collection(repo_id, repo_git, logger, manifest.key_auth, task_name, augur_db, core_data_last_collected) + + +def is_repo_small(repo_id): + with get_session() as session: + result = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo_id, CollectionStatus.issue_pr_sum <= 10).first() -def retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, task_name) -> None: + return result != None + +def fast_retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, task_name, since) -> None: owner, repo = get_owner_repo(repo_git) # url to get issue and pull request comments url = f"https://api.github.com/repos/{owner}/{repo}/issues/comments" + if since: + url += f"?since={since.isoformat()}" + # define logger for task logger.info(f"Collecting github comments for {owner}/{repo}") + + github_data_access = GithubDataAccess(key_auth, logger) - # url to get issue and pull request comments - url = f"https://api.github.com/repos/{owner}/{repo}/issues/comments" + message_count = github_data_access.get_resource_count(url) - # define database task session, that also holds authentication keys the GithubPaginator needs - - # returns an iterable of all issues at this url (this essentially means you can treat the issues variable as a list of the issues) - messages = GithubPaginator(url, key_auth, logger) + logger.info(f"{task_name}: Collecting {message_count} github messages") - num_pages = messages.get_num_pages() - all_data = [] - for page_data, page in messages.iter_pages(): + return list(github_data_access.paginate_resource(url)) + + +def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger, key_auth, task_name, augur_db, since) -> None: + + owner, repo = get_owner_repo(repo_git) - if page_data is None: - return all_data + # define logger for task + logger.info(f"Collecting github comments for {owner}/{repo}") - elif len(page_data) == 0: - logger.debug(f"{repo.capitalize()} Messages Page {page} contains no data...returning") - logger.info( - f"{task_name}: Page {page} of {num_pages}") - return all_data + engine = get_engine() - logger.info(f"{task_name}: Page {page} of {num_pages}") + with engine.connect() as connection: - all_data += page_data + if since: + query = text(f""" + (select pr_comments_url from pull_requests WHERE repo_id={repo_id} AND pr_updated_at > timestamptz(timestamp '{since}') order by pr_created_at desc) + UNION + (select comments_url as comment_url from issues WHERE repo_id={repo_id} AND updated_at > timestamptz(timestamp '{since}') order by created_at desc); + """) + else: + + query = text(f""" + (select pr_comments_url from pull_requests WHERE repo_id={repo_id} order by pr_created_at desc) + UNION + (select 
comments_url as comment_url from issues WHERE repo_id={repo_id} order by created_at desc); + """) - return all_data - + result = connection.execute(query).fetchall() + comment_urls = [x[0] for x in result] + + github_data_access = GithubDataAccess(key_auth, logger) + + logger.info(f"{task_name}: Collecting github messages for {len(comment_urls)} prs/issues") + + all_data = [] + for comment_url in comment_urls: + + try: + messages = list(github_data_access.paginate_resource(comment_url)) + except UrlNotFoundException as e: + logger.warning(e) + continue + + all_data += messages + + if len(all_data) >= 20: + process_messages(all_data, task_name, repo_id, logger, augur_db) + all_data.clear() + + if len(all_data) > 0: + process_messages(all_data, task_name, repo_id, logger, augur_db) + def process_messages(messages, task_name, repo_id, logger, augur_db): @@ -178,7 +231,7 @@ def process_messages(messages, task_name, repo_id, logger, augur_db): augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") - message_natural_keys = ["platform_msg_id"] + message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, @@ -233,4 +286,4 @@ def process_github_comment_contributors(message, tool_source, tool_version, data # This is done by searching all the dicts for the given key that has the specified value def find_dict_in_list_of_dicts(data, key, value): - return next((item for item in data if item[key] == value), None) + return next((item for item in data if item[key] == value), None) \ No newline at end of file diff --git a/augur/tasks/github/messages/__init__.py b/augur/tasks/github/messages/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/augur/tasks/github/messages/core.py b/augur/tasks/github/messages/core.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/augur/tasks/github/pull_requests/commits_model/core.py b/augur/tasks/github/pull_requests/commits_model/core.py index ea91a597da..3f08fe9c01 100644 --- a/augur/tasks/github/pull_requests/commits_model/core.py +++ b/augur/tasks/github/pull_requests/commits_model/core.py @@ -1,22 +1,37 @@ import sqlalchemy as s -from augur.tasks.github.util.github_paginator import GithubPaginator +from augur.tasks.github.util.github_data_access import GithubDataAccess from augur.application.db.models import * from augur.tasks.github.util.util import get_owner_repo from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_secondary_data_last_collected, get_updated_prs -def pull_request_commits_model(repo_id,logger, augur_db, key_auth): +def pull_request_commits_model(repo_id,logger, augur_db, key_auth, full_collection=False): - # query existing PRs and the respective url we will append the commits url to - pr_url_sql = s.sql.text(""" - SELECT DISTINCT pr_url, pull_requests.pull_request_id - FROM pull_requests--, pull_request_meta - WHERE repo_id = :repo_id - """).bindparams(repo_id=repo_id) - pr_urls = [] - #pd.read_sql(pr_number_sql, self.db, params={}) - - pr_urls = augur_db.fetchall_data_from_sql_text(pr_url_sql)#session.execute_sql(pr_number_sql).fetchall() + if full_collection: + # query existing PRs and the respective url we will append the commits url to + pr_url_sql = s.sql.text(""" + SELECT DISTINCT pr_url, 
pull_requests.pull_request_id + FROM pull_requests--, pull_request_meta + WHERE repo_id = :repo_id + """).bindparams(repo_id=repo_id) + pr_urls = [] + #pd.read_sql(pr_number_sql, self.db, params={}) + + pr_urls = augur_db.fetchall_data_from_sql_text(pr_url_sql)#session.execute_sql(pr_number_sql).fetchall() + + else: + last_collected = get_secondary_data_last_collected(repo_id).date() + prs = get_updated_prs(repo_id, last_collected) + pr_urls = [pr.pr_url for pr in prs] + + pr_urls = [] + for pr in prs: + pr_urls.append({ + 'pr_url': pr.pr_url, + 'pull_request_id': pr.pull_request_id + }) + query = augur_db.session.query(Repo).filter(Repo.repo_id == repo_id) repo = execute_session_query(query, 'one') @@ -26,33 +41,31 @@ def pull_request_commits_model(repo_id,logger, augur_db, key_auth): task_name = f"{owner}/{name} Pr commits" logger.info(f"Getting pull request commits for repo: {repo.repo_git}") + + github_data_access = GithubDataAccess(key_auth, logger) all_data = [] for index,pr_info in enumerate(pr_urls): logger.info(f'{task_name}: Querying commits for pull request #{index + 1} of {len(pr_urls)}') commits_url = pr_info['pr_url'] + '/commits?state=all' - - #Paginate through the pr commits - pr_commits = GithubPaginator(commits_url, key_auth, logger) - for page_data in pr_commits: - - if page_data: - logger.info(f"{task_name}: Processing pr commit with hash {page_data['sha']}") - pr_commit_row = { - 'pull_request_id': pr_info['pull_request_id'], - 'pr_cmt_sha': page_data['sha'], - 'pr_cmt_node_id': page_data['node_id'], - 'pr_cmt_message': page_data['commit']['message'], - # 'pr_cmt_comments_url': pr_commit['comments_url'], - 'tool_source': 'pull_request_commits_model', - 'tool_version': '0.41', - 'data_source': 'GitHub API', - 'repo_id': repo_id, - } - - all_data.append(pr_commit_row) + for page_data in github_data_access.paginate_resource(commits_url): + + logger.info(f"{task_name}: Processing pr commit with hash {page_data['sha']}") + pr_commit_row = { + 'pull_request_id': pr_info['pull_request_id'], + 'pr_cmt_sha': page_data['sha'], + 'pr_cmt_node_id': page_data['node_id'], + 'pr_cmt_message': page_data['commit']['message'], + # 'pr_cmt_comments_url': pr_commit['comments_url'], + 'tool_source': 'pull_request_commits_model', + 'tool_version': '0.41', + 'data_source': 'GitHub API', + 'repo_id': repo.repo_id, + } + + all_data.append(pr_commit_row) if len(all_data) > 0: logger.info(f"{task_name}: Inserting {len(all_data)} rows") diff --git a/augur/tasks/github/pull_requests/commits_model/tasks.py b/augur/tasks/github/pull_requests/commits_model/tasks.py index f0a065bdd1..e6acdfa90a 100644 --- a/augur/tasks/github/pull_requests/commits_model/tasks.py +++ b/augur/tasks/github/pull_requests/commits_model/tasks.py @@ -2,20 +2,18 @@ from augur.tasks.github.pull_requests.commits_model.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurSecondaryRepoCollectionTask -from augur.application.db.util import execute_session_query from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.application.db.lib import get_repo_by_repo_git + @celery.task(base=AugurSecondaryRepoCollectionTask) -def process_pull_request_commits(repo_git: str) -> None: +def process_pull_request_commits(repo_git: str, full_collection: bool) -> None: logger = logging.getLogger(process_pull_request_commits.__name__) - with GithubTaskManifest(logger) as manifest: - - augur_db = manifest.augur_db + repo = get_repo_by_repo_git(repo_git) - 
query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query, 'one') + with GithubTaskManifest(logger) as manifest: - pull_request_commits_model(repo.repo_id, logger, augur_db, manifest.key_auth) + pull_request_commits_model(repo.repo_id, logger, manifest.augur_db, manifest.key_auth, full_collection) diff --git a/augur/tasks/github/pull_requests/core.py b/augur/tasks/github/pull_requests/core.py index 5bc86cd676..dd63edab6e 100644 --- a/augur/tasks/github/pull_requests/core.py +++ b/augur/tasks/github/pull_requests/core.py @@ -4,6 +4,7 @@ from augur.application.db.data_parse import * from augur.application.db.session import DatabaseSession +from augur.application.db.lib import bulk_insert_dicts, batch_insert_contributors from augur.tasks.github.util.util import add_key_value_pair_to_dicts from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, Contributor @@ -129,12 +130,12 @@ def extract_data_from_pr_list(pull_requests: List[dict], return pr_dicts, pr_mapping_data, pr_numbers, contributors -def insert_pr_contributors(contributors: List[dict], session: DatabaseSession, task_name: str) -> None: +def insert_pr_contributors(contributors: List[dict], logger, task_name: str) -> None: """Insert pr contributors Args: contributors: the contributor data that is being inserted - session: database session to insert the data with + logger task_name: to differiante between log statements since there are multiple tasks of the same type """ @@ -142,16 +143,16 @@ def insert_pr_contributors(contributors: List[dict], session: DatabaseSession, t contributors = remove_duplicate_dicts(contributors) # insert contributors from these prs - session.logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - session.insert_data(contributors, Contributor, ["cntrb_id"]) + logger.info(f"{task_name}: Inserting {len(contributors)} contributors") + batch_insert_contributors(logger, contributors) -def insert_prs(pr_dicts: List[dict], session: DatabaseSession, task_name: str) -> Optional[List[dict]]: +def insert_prs(pr_dicts: List[dict], logger, task_name: str) -> Optional[List[dict]]: """Insert pull requests Args: pr_dicts: the pull request data that is being inserted - session: database session to insert the data with + logger task_name: to differiante between log statements since there are multiple tasks of the same type Returns: @@ -159,10 +160,10 @@ def insert_prs(pr_dicts: List[dict], session: DatabaseSession, task_name: str) - So we can determine what labels, assigness, and other data belong to each pr """ - session.logger.info(f"{task_name}: Inserting prs of length: {len(pr_dicts)}") + logger.info(f"{task_name}: Inserting prs of length: {len(pr_dicts)}") pr_natural_keys = ["pr_url"] pr_return_columns = ["pull_request_id", "pr_url"] - pr_return_data = session.insert_data(pr_dicts, PullRequest, pr_natural_keys, return_columns=pr_return_columns) + pr_return_data = bulk_insert_dicts(logger, pr_dicts, PullRequest, pr_natural_keys, return_columns=pr_return_columns) return pr_return_data @@ -211,7 +212,7 @@ def map_other_pr_data_to_pr( return pr_label_dicts, pr_assignee_dicts, pr_reviewer_dicts, pr_metadata_dicts -def insert_pr_labels(labels: List[dict], logger: logging.Logger, session) -> None: +def insert_pr_labels(labels: List[dict], logger: logging.Logger) -> None: """Insert pull request labels Note: @@ -223,10 +224,10 @@ 
def insert_pr_labels(labels: List[dict], logger: logging.Logger, session) -> Non """ # we are using pr_src_id and pull_request_id to determine if the label is already in the database. pr_label_natural_keys = ['pr_src_id', 'pull_request_id'] - session.insert_data(labels, PullRequestLabel, pr_label_natural_keys) + bulk_insert_dicts(labels, PullRequestLabel, pr_label_natural_keys) -def insert_pr_assignees(assignees: List[dict], logger: logging.Logger, session) -> None: +def insert_pr_assignees(assignees: List[dict], logger: logging.Logger) -> None: """Insert pull request assignees Note: @@ -238,10 +239,10 @@ def insert_pr_assignees(assignees: List[dict], logger: logging.Logger, session) """ # we are using pr_assignee_src_id and pull_request_id to determine if the label is already in the database. pr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] - session.insert_data(assignees, PullRequestAssignee, pr_assignee_natural_keys) + bulk_insert_dicts(logger, assignees, PullRequestAssignee, pr_assignee_natural_keys) -def insert_pr_reviewers(reviewers: List[dict], logger: logging.Logger, session) -> None: +def insert_pr_reviewers(reviewers: List[dict], logger: logging.Logger) -> None: """Insert pull request reviewers Note: @@ -253,10 +254,10 @@ def insert_pr_reviewers(reviewers: List[dict], logger: logging.Logger, session) """ # we are using pr_src_id and pull_request_id to determine if the label is already in the database. pr_reviewer_natural_keys = ["pull_request_id", "pr_reviewer_src_id"] - session.insert_data(reviewers, PullRequestReviewer, pr_reviewer_natural_keys) + bulk_insert_dicts(reviewers, PullRequestReviewer, pr_reviewer_natural_keys) -def insert_pr_metadata(metadata: List[dict], logger: logging.Logger, session) -> None: +def insert_pr_metadata(metadata: List[dict], logger: logging.Logger) -> None: """Insert pull request metadata Note: @@ -269,7 +270,7 @@ def insert_pr_metadata(metadata: List[dict], logger: logging.Logger, session) -> # inserting pr metadata # we are using pull_request_id, pr_head_or_base, and pr_sha to determine if the label is already in the database. 
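As used across this patch, bulk_insert_dicts appears to take (logger, records, model, natural_keys, ...) and to upsert on the natural keys; note that the insert_pr_labels and insert_pr_reviewers hunks above omit the logger argument, which looks inconsistent with the other call sites. A hedged sketch of the call shape assumed elsewhere in the diff (the wrapper name is illustrative):

    from augur.application.db.lib import bulk_insert_dicts
    from augur.application.db.models import PullRequestMeta

    def upsert_pr_metadata(logger, metadata_rows):
        # Deduplicate on the same natural keys used in the hunk below.
        pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha']
        bulk_insert_dicts(logger, metadata_rows, PullRequestMeta, pr_metadata_natural_keys)
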
pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha'] - session.insert_data(metadata, PullRequestMeta, pr_metadata_natural_keys) + bulk_insert_dicts(logger, metadata, PullRequestMeta, pr_metadata_natural_keys) diff --git a/augur/tasks/github/pull_requests/files_model/core.py b/augur/tasks/github/pull_requests/files_model/core.py index 138aa61cb3..badc86cd38 100644 --- a/augur/tasks/github/pull_requests/files_model/core.py +++ b/augur/tasks/github/pull_requests/files_model/core.py @@ -1,41 +1,54 @@ import sqlalchemy as s -from augur.tasks.github.util.gh_graphql_entities import GraphQlPageCollection +from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess, NotFoundException, InvalidDataException from augur.application.db.models import * from augur.tasks.github.util.util import get_owner_repo from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_secondary_data_last_collected, get_updated_prs -def pull_request_files_model(repo_id,logger, augur_db, key_auth): + +def pull_request_files_model(repo_id,logger, augur_db, key_auth, full_collection=False): - # query existing PRs and the respective url we will append the commits url to - pr_number_sql = s.sql.text(""" - SELECT DISTINCT pr_src_number as pr_src_number, pull_requests.pull_request_id - FROM pull_requests--, pull_request_meta - WHERE repo_id = :repo_id - """).bindparams(repo_id=repo_id) - pr_numbers = [] - #pd.read_sql(pr_number_sql, self.db, params={}) + if full_collection: + # query existing PRs and the respective url we will append the commits url to + pr_number_sql = s.sql.text(""" + SELECT DISTINCT pr_src_number as pr_src_number, pull_requests.pull_request_id + FROM pull_requests--, pull_request_meta + WHERE repo_id = :repo_id + """).bindparams(repo_id=repo_id) + pr_numbers = [] + #pd.read_sql(pr_number_sql, self.db, params={}) + + result = augur_db.execute_sql(pr_number_sql)#.fetchall() + pr_numbers = [dict(row) for row in result.mappings()] + + else: + last_collected = get_secondary_data_last_collected(repo_id).date() + prs = get_updated_prs(repo_id, last_collected) - result = augur_db.execute_sql(pr_number_sql)#.fetchall() - pr_numbers = [dict(row) for row in result.mappings()] + pr_numbers = [] + for pr in prs: + pr_numbers.append({ + 'pr_src_number': pr.pr_src_number, + 'pull_request_id': pr.pull_request_id + }) query = augur_db.session.query(Repo).filter(Repo.repo_id == repo_id) repo = execute_session_query(query, 'one') - owner, name = get_owner_repo(repo.repo_git) + github_graphql_data_access = GithubGraphQlDataAccess(key_auth, logger) + pr_file_rows = [] logger.info(f"Getting pull request files for repo: {repo.repo_git}") - for index,pr_info in enumerate(pr_numbers): + for index, pr_info in enumerate(pr_numbers): logger.info(f'Querying files for pull request #{index + 1} of {len(pr_numbers)}') query = """ - query($repo: String!, $owner: String!,$pr_number: Int!, $numRecords: Int!, $cursor: String) { repository(name: $repo, owner: $owner) { pullRequest(number: $pr_number) { - files ( first: $numRecords, after: $cursor) - { + files ( first: $numRecords, after: $cursor) { edges { node { additions @@ -54,28 +67,34 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth): } """ - values = ("repository","pullRequest","files") + values = ["repository", "pullRequest", "files"] params = { - 'owner' : owner, - 'repo' : name, - 'pr_number' : pr_info['pr_src_number'], - 'values' : values + 'owner': owner, + 'repo': name, + 
'pr_number': pr_info['pr_src_number'], } - - file_collection = GraphQlPageCollection(query, key_auth, logger,bind=params) + try: + for pr_file in github_graphql_data_access.paginate_resource(query, params, values): - pr_file_rows += [{ - 'pull_request_id': pr_info['pull_request_id'], - 'pr_file_additions': pr_file['additions'] if 'additions' in pr_file else None, - 'pr_file_deletions': pr_file['deletions'] if 'deletions' in pr_file else None, - 'pr_file_path': pr_file['path'], - 'data_source': 'GitHub API', - 'repo_id': repo_id, - } for pr_file in file_collection if pr_file and 'path' in pr_file] + if not pr_file or 'path' not in pr_file: + continue + + data = { + 'pull_request_id': pr_info['pull_request_id'], + 'pr_file_additions': pr_file['additions'] if 'additions' in pr_file else None, + 'pr_file_deletions': pr_file['deletions'] if 'deletions' in pr_file else None, + 'pr_file_path': pr_file['path'], + 'data_source': 'GitHub API', + 'repo_id': repo.repo_id, + } + pr_file_rows.append(data) + except (NotFoundException, InvalidDataException) as e: + logger.warning(e) + continue if len(pr_file_rows) > 0: - #Execute a bulk upsert with sqlalchemy + # Execute a bulk upsert with sqlalchemy pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"] augur_db.insert_data(pr_file_rows, PullRequestFile, pr_file_natural_keys) diff --git a/augur/tasks/github/pull_requests/files_model/tasks.py b/augur/tasks/github/pull_requests/files_model/tasks.py index 988261f6c8..be75c88a9d 100644 --- a/augur/tasks/github/pull_requests/files_model/tasks.py +++ b/augur/tasks/github/pull_requests/files_model/tasks.py @@ -6,7 +6,7 @@ from augur.application.db.util import execute_session_query @celery.task(base=AugurSecondaryRepoCollectionTask) -def process_pull_request_files(repo_git: str) -> None: +def process_pull_request_files(repo_git: str, full_collection: bool) -> None: logger = logging.getLogger(process_pull_request_files.__name__) @@ -15,4 +15,4 @@ def process_pull_request_files(repo_git: str) -> None: query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) repo = execute_session_query(query, 'one') - pull_request_files_model(repo.repo_id, logger, augur_db, manifest.key_auth) \ No newline at end of file + pull_request_files_model(repo.repo_id, logger, augur_db, manifest.key_auth, full_collection) \ No newline at end of file diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 37f59e5210..812a4eef25 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -1,23 +1,28 @@ import logging +from datetime import datetime, timedelta, timezone from augur.tasks.github.pull_requests.core import extract_data_from_pr_list from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask, AugurSecondaryRepoCollectionTask from augur.application.db.data_parse import * -from augur.tasks.github.util.github_paginator import GithubPaginator -from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, 
Contributor, Repo +from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from augur.application.db.lib import get_session, get_repo_by_repo_git, bulk_insert_dicts, get_pull_request_reviews_by_repo_id, batch_insert_contributors from augur.application.db.util import execute_session_query -from ..messages.tasks import process_github_comment_contributors +from ..messages import process_github_comment_contributors +from augur.application.db.lib import get_secondary_data_last_collected, get_updated_prs, get_core_data_last_collected +from typing import Generator, List, Dict -platform_id = 1 +platform_id = 1 @celery.task(base=AugurCoreRepoCollectionTask) -def collect_pull_requests(repo_git: str) -> int: +def collect_pull_requests(repo_git: str, full_collection: bool) -> int: logger = logging.getLogger(collect_pull_requests.__name__) @@ -29,48 +34,61 @@ def collect_pull_requests(repo_git: str) -> int: Repo.repo_git == repo_git).one().repo_id owner, repo = get_owner_repo(repo_git) - pr_data = retrieve_all_pr_data(repo_git, logger, manifest.key_auth) - if pr_data: - process_pull_requests(pr_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) + if full_collection: + core_data_last_collected = None + else: + # subtract 2 days to ensure all data is collected + core_data_last_collected = (get_core_data_last_collected(repo_id) - timedelta(days=2)).replace(tzinfo=timezone.utc) + + total_count = 0 + all_data = [] + for pr in retrieve_all_pr_data(repo_git, logger, manifest.key_auth, core_data_last_collected): + + all_data.append(pr) + + if len(all_data) >= 1000: + process_pull_requests(all_data, f"{owner}/{repo}: Github Pr task", repo_id, logger, augur_db) + total_count += len(all_data) + all_data.clear() + + if len(all_data): + process_pull_requests(all_data, f"{owner}/{repo}: Github Pr task", repo_id, logger, augur_db) + total_count += len(all_data) - return len(pr_data) + if total_count > 0: + return total_count else: - logger.info(f"{owner}/{repo} has no pull requests") + logger.debug(f"{owner}/{repo} has no pull requests") return 0 + # TODO: Rename pull_request_reviewers table to pull_request_requested_reviewers # TODO: Fix column names in pull request labels table -def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> None: +def retrieve_all_pr_data(repo_git: str, logger, key_auth, since): #-> Generator[List[Dict]]: owner, repo = get_owner_repo(repo_git) - logger.info(f"Collecting pull requests for {owner}/{repo}") + logger.debug(f"Collecting pull requests for {owner}/{repo}") - url = f"https://api.github.com/repos/{owner}/{repo}/pulls?state=all&direction=desc" - # returns an iterable of all prs at this url (this essentially means you can treat the prs variable as a list of the prs) - prs = GithubPaginator(url, key_auth, logger) + url = f"https://api.github.com/repos/{owner}/{repo}/pulls?state=all&direction=desc&sort=updated" - all_data = [] - num_pages = prs.get_num_pages() - for page_data, page in prs.iter_pages(): + github_data_access = GithubDataAccess(key_auth, logger) - if page_data is None: - return all_data + num_pages = github_data_access.get_resource_page_count(url) - if len(page_data) == 0: - logger.debug( - f"{owner}/{repo} Prs Page {page} contains no data...returning") - logger.info(f"{owner}/{repo} Prs Page {page} of {num_pages}") - return all_data + logger.debug(f"{owner}/{repo}: Retrieving {num_pages} pages of pull requests") - logger.info(f"{owner}/{repo} Prs Page 
{page} of {num_pages}") + # returns a generator so this method can be used by doing for x in retrieve_all_pr_data() - all_data += page_data + for pr in github_data_access.paginate_resource(url): - return all_data + yield pr + # return if last pr on the page was updated before the since date + if since and datetime.fromisoformat(pr["updated_at"].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) < since: + return def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): """ @@ -177,6 +195,11 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): + + + + + @@ -195,129 +218,117 @@ def process_pull_request_review_contributor(pr_review: dict, tool_source: str, t return pr_review_cntrb - @celery.task(base=AugurSecondaryRepoCollectionTask) -def collect_pull_request_review_comments(repo_git: str) -> None: +def collect_pull_request_review_comments(repo_git: str, full_collection: bool) -> None: owner, repo = get_owner_repo(repo_git) review_msg_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/comments" logger = logging.getLogger(collect_pull_request_review_comments.__name__) - logger.info(f"Collecting pull request review comments for {owner}/{repo}") - - # define GithubTaskSession to handle insertions, and store oauth keys - with GithubTaskManifest(logger) as manifest: - - augur_db = manifest.augur_db - - # get repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id + logger.debug(f"Collecting pull request review comments for {owner}/{repo}") - query = augur_db.session.query(PullRequestReview).filter(PullRequestReview.repo_id == repo_id) - pr_reviews = execute_session_query(query, 'all') + repo_id = get_repo_by_repo_git(repo_git).repo_id - # maps the github pr_review id to the auto incrementing pk that augur stores as pr_review id - pr_review_id_mapping = {} - for review in pr_reviews: - pr_review_id_mapping[review.pr_review_src_id] = review.pr_review_id - - - tool_source = "Pr review comment task" - tool_version = "2.0" - data_source = "Github API" + if not full_collection: + last_collected_date = get_secondary_data_last_collected(repo_id) + + if last_collected_date: + # subtract 2 days to ensure all data is collected + core_data_last_collected = (last_collected_date - timedelta(days=2)).replace(tzinfo=timezone.utc) + review_msg_url += f"?since={core_data_last_collected.isoformat()}" + else: + logger.warning(f"core_data_last_collected is NULL for recollection on repo: {repo_git}") - pr_review_messages = GithubPaginator(review_msg_url, manifest.key_auth, logger) - num_pages = pr_review_messages.get_num_pages() + pr_reviews = get_pull_request_reviews_by_repo_id(repo_id) - all_raw_pr_review_messages = [] - for page_data, page in pr_review_messages.iter_pages(): + # maps the github pr_review id to the auto incrementing pk that augur stores as pr_review id + pr_review_id_mapping = {} + for review in pr_reviews: + pr_review_id_mapping[review.pr_review_src_id] = review.pr_review_id - if page_data is None: - break - if len(page_data) == 0: - logger.debug(f"{owner}/{repo} Pr Review Messages Page {page} contains no data...returning") - logger.info(f"{owner}/{repo} Pr Review Messages Page {page} of {num_pages}") - break + tool_source = "Pr review comment task" + tool_version = "2.0" + data_source = "Github API" - logger.info(f"{owner}/{repo} Pr Review Messages Page {page} of {num_pages}") + key_auth = GithubRandomKeyAuth(logger) + github_data_access = 
GithubDataAccess(key_auth, logger) - all_raw_pr_review_messages += page_data + all_raw_pr_review_messages = list(github_data_access.paginate_resource(review_msg_url)) - contributors = [] - for comment in all_raw_pr_review_messages: - - _, contributor = process_github_comment_contributors(comment, tool_source, tool_version, data_source) - if contributor is not None: - contributors.append(contributor) + contributors = [] + for comment in all_raw_pr_review_messages: + + _, contributor = process_github_comment_contributors(comment, tool_source, tool_version, data_source) + if contributor is not None: + contributors.append(contributor) - logger.info(f"{owner}/{repo} Pr review messages: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + logger.info(f"{owner}/{repo} Pr review messages: Inserting {len(contributors)} contributors") + batch_insert_contributors(logger, contributors) - pr_review_comment_dicts = [] - pr_review_msg_mapping_data = {} + pr_review_comment_dicts = [] + pr_review_msg_mapping_data = {} - pr_review_comments_len = len(all_raw_pr_review_messages) - logger.info(f"{owner}/{repo}: Pr review comments len: {pr_review_comments_len}") - for index, comment in enumerate(all_raw_pr_review_messages): + pr_review_comments_len = len(all_raw_pr_review_messages) + for comment in all_raw_pr_review_messages: - # pull_request_review_id is required to map it to the correct pr review - if not comment["pull_request_review_id"]: - continue + # pull_request_review_id is required to map it to the correct pr review + if not comment["pull_request_review_id"]: + continue - pr_review_comment_dicts.append( - extract_needed_message_data(comment, platform_id, repo_id, tool_source, tool_version, data_source) - ) + pr_review_comment_dicts.append( + extract_needed_message_data(comment, platform_id, repo_id, tool_source, tool_version, data_source) + ) - # map github message id to the data that maps it to the pr review - github_msg_id = comment["id"] - pr_review_msg_mapping_data[github_msg_id] = comment + # map github message id to the data that maps it to the pr review + github_msg_id = comment["id"] + pr_review_msg_mapping_data[github_msg_id] = comment - logger.info(f"Inserting {len(pr_review_comment_dicts)} pr review comments") - message_natural_keys = ["platform_msg_id"] - message_return_columns = ["msg_id", "platform_msg_id"] - message_return_data = augur_db.insert_data(pr_review_comment_dicts, Message, message_natural_keys, message_return_columns) - if message_return_data is None: - return + logger.info(f"Inserting {len(pr_review_comment_dicts)} pr review comments") + message_natural_keys = ["platform_msg_id", "pltfrm_id"] + message_return_columns = ["msg_id", "platform_msg_id"] + message_string_fields = ["msg_text"] + message_return_data = bulk_insert_dicts(logger, pr_review_comment_dicts, Message, message_natural_keys, + return_columns=message_return_columns, string_fields=message_string_fields) + if message_return_data is None: + return - pr_review_message_ref_insert_data = [] - for data in message_return_data: + pr_review_message_ref_insert_data = [] + for data in message_return_data: - augur_msg_id = data["msg_id"] - github_msg_id = data["platform_msg_id"] + augur_msg_id = data["msg_id"] + github_msg_id = data["platform_msg_id"] - comment = pr_review_msg_mapping_data[github_msg_id] - comment["msg_id"] = augur_msg_id + comment = pr_review_msg_mapping_data[github_msg_id] + comment["msg_id"] = augur_msg_id - github_pr_review_id = 
comment["pull_request_review_id"] + github_pr_review_id = comment["pull_request_review_id"] - try: - augur_pr_review_id = pr_review_id_mapping[github_pr_review_id] - except KeyError: - logger.info(f"{owner}/{repo}: Could not find related pr review") - logger.info(f"{owner}/{repo}: We were searching for pr review with id: {github_pr_review_id}") - logger.info("Skipping") - continue + try: + augur_pr_review_id = pr_review_id_mapping[github_pr_review_id] + except KeyError: + logger.warning(f"{owner}/{repo}: Could not find related pr review. We were searching for pr review with id: {github_pr_review_id}") + continue - pr_review_message_ref = extract_pr_review_message_ref_data(comment, augur_pr_review_id, github_pr_review_id, repo_id, tool_version, data_source) - pr_review_message_ref_insert_data.append(pr_review_message_ref) + pr_review_message_ref = extract_pr_review_message_ref_data(comment, augur_pr_review_id, github_pr_review_id, repo_id, tool_version, data_source) + pr_review_message_ref_insert_data.append(pr_review_message_ref) - logger.info(f"Inserting {len(pr_review_message_ref_insert_data)} pr review refs") - pr_comment_ref_natural_keys = ["pr_review_msg_src_id"] - augur_db.insert_data(pr_review_message_ref_insert_data, PullRequestReviewMessageRef, pr_comment_ref_natural_keys) + logger.info(f"Inserting {len(pr_review_message_ref_insert_data)} pr review refs") + pr_comment_ref_natural_keys = ["pr_review_msg_src_id"] + pr_review_msg_ref_string_columns = ["pr_review_msg_diff_hunk"] + bulk_insert_dicts(logger, pr_review_message_ref_insert_data, PullRequestReviewMessageRef, pr_comment_ref_natural_keys, string_fields=pr_review_msg_ref_string_columns) @celery.task(base=AugurSecondaryRepoCollectionTask) -def collect_pull_request_reviews(repo_git: str) -> None: +def collect_pull_request_reviews(repo_git: str, full_collection: bool) -> None: logger = logging.getLogger(collect_pull_request_reviews.__name__) @@ -327,6 +338,7 @@ def collect_pull_request_reviews(repo_git: str) -> None: tool_source = "pull_request_reviews" data_source = "Github API" + repo_id = get_repo_by_repo_git(repo_git).repo_id with GithubTaskManifest(logger) as manifest: augur_db = manifest.augur_db @@ -334,65 +346,64 @@ def collect_pull_request_reviews(repo_git: str) -> None: query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) repo_id = execute_session_query(query, 'one').repo_id - query = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).order_by(PullRequest.pr_src_number) - prs = execute_session_query(query, 'all') + if full_collection: + + query = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).order_by(PullRequest.pr_src_number) + prs = execute_session_query(query, 'all') + else: + last_collected = get_secondary_data_last_collected(repo_id).date() + prs = get_updated_prs(repo_id, last_collected) pr_count = len(prs) + github_data_access = GithubDataAccess(manifest.key_auth, logger) + all_pr_reviews = {} for index, pr in enumerate(prs): pr_number = pr.pr_src_number pull_request_id = pr.pull_request_id - logger.info(f"{owner}/{repo} Collecting Pr Reviews for pr {index + 1} of {pr_count}") + logger.debug(f"{owner}/{repo} Collecting Pr Reviews for pr {index + 1} of {pr_count}") pr_review_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/reviews" - pr_reviews = [] - pr_reviews_generator = GithubPaginator(pr_review_url, manifest.key_auth, logger) - for page_data, page in pr_reviews_generator.iter_pages(): - - if page_data is None: - break 
- - if len(page_data) == 0: - break + try: + pr_reviews = list(github_data_access.paginate_resource(pr_review_url)) + except UrlNotFoundException as e: + logger.warning(e) + continue - pr_reviews.extend(page_data) - if pr_reviews: all_pr_reviews[pull_request_id] = pr_reviews if not list(all_pr_reviews.keys()): - logger.info(f"{owner}/{repo} No pr reviews for repo") + logger.debug(f"{owner}/{repo} No pr reviews for repo") return contributors = [] - for pull_request_id in all_pr_reviews.keys(): + for pull_request_id, reviews in all_pr_reviews.items(): - reviews = all_pr_reviews[pull_request_id] for review in reviews: contributor = process_pull_request_review_contributor(review, tool_source, tool_version, data_source) if contributor: contributors.append(contributor) - logger.info(f"{owner}/{repo} Pr reviews: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + logger.info(f"{owner}/{repo} Pr reviews: Inserting {len(contributors)} contributors") + augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) pr_reviews = [] - for pull_request_id in all_pr_reviews.keys(): + for pull_request_id, reviews in all_pr_reviews.items(): - reviews = all_pr_reviews[pull_request_id] for review in reviews: if "cntrb_id" in review: pr_reviews.append(extract_needed_pr_review_data(review, pull_request_id, repo_id, platform_id, tool_source, tool_version)) - logger.info(f"{owner}/{repo}: Inserting pr reviews of length: {len(pr_reviews)}") - pr_review_natural_keys = ["pr_review_src_id",] - augur_db.insert_data(pr_reviews, PullRequestReview, pr_review_natural_keys) + logger.info(f"{owner}/{repo}: Inserting pr reviews of length: {len(pr_reviews)}") + pr_review_natural_keys = ["pr_review_src_id",] + augur_db.insert_data(pr_reviews, PullRequestReview, pr_review_natural_keys) diff --git a/augur/tasks/github/releases/core.py b/augur/tasks/github/releases/core.py index b7f953c618..3192401ae3 100644 --- a/augur/tasks/github/releases/core.py +++ b/augur/tasks/github/releases/core.py @@ -4,6 +4,7 @@ from augur.tasks.github.util.util import get_owner_repo from augur.tasks.github.util.gh_graphql_entities import request_graphql_dict from augur.application.db.util import execute_session_query +from augur.application.db.lib import bulk_insert_dicts def get_release_inf(repo_id, release, tag_only): @@ -63,11 +64,11 @@ def get_release_inf(repo_id, release, tag_only): return release_inf -def insert_release(augur_db, logger, repo_id, owner, release, tag_only = False): +def insert_release(session, logger, repo_id, owner, release, tag_only = False): # Get current table values logger.info('Getting release table values\n') - query = augur_db.session.query(Release.release_id).filter(Release.repo_id == repo_id) + query = session.query(Release.release_id).filter(Release.repo_id == repo_id) release_id_data = execute_session_query(query, 'all')#pd.read_sql(release_id_data_sql, self.db, params={'repo_id': repo_id}) release_id_data = [str(r_id).strip() for r_id in release_id_data]#release_id_data.apply(lambda x: x.str.strip()) @@ -77,7 +78,7 @@ def insert_release(augur_db, logger, repo_id, owner, release, tag_only = False): #Do an upsert string_fields = ["release_name", "release_description", "release_author", "release_tag_name"] - augur_db.insert_data(release_inf,Release,['release_id'], string_fields=string_fields) + bulk_insert_dicts(logger, release_inf,Release,['release_id'], string_fields=string_fields) logger.info(f"Inserted info for {owner}/{repo_id}/{release['name']}\n") @@ 
-166,7 +167,7 @@ def fetch_data(key_auth, logger, github_url, repo_id, tag_only = False): return data -def releases_model(augur_db, key_auth, logger, repo_git, repo_id): +def releases_model(session, key_auth, logger, repo_git, repo_id): try: data = fetch_data(key_auth, logger, repo_git, repo_id) @@ -181,7 +182,7 @@ def releases_model(augur_db, key_auth, logger, repo_git, repo_id): if 'node' in n: release = n['node'] #self.insert_release(task, repo_id, data['owner'], release) - insert_release(augur_db, logger, repo_id, data['owner'], release) + insert_release(session, logger, repo_id, data['owner'], release) else: logger.info("There's no release to insert. Current node is not available in releases: {}\n".format(n)) elif 'edges' in data['releases'] and not data['releases']['edges']: @@ -194,7 +195,7 @@ def releases_model(augur_db, key_auth, logger, repo_git, repo_id): if 'node' in n: release = n['node'] #self.insert_release(task, repo_id, data['owner'], release, True) - insert_release(augur_db,logger, repo_id, data['owner'], release, True) + insert_release(session, logger, repo_id, data['owner'], release, True) else: logger.info("There's no release to insert. Current node is not available in releases: {}\n".format(n)) else: diff --git a/augur/tasks/github/releases/tasks.py b/augur/tasks/github/releases/tasks.py index 310da90d74..3e2210a7c9 100644 --- a/augur/tasks/github/releases/tasks.py +++ b/augur/tasks/github/releases/tasks.py @@ -1,22 +1,22 @@ import logging -from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.releases.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_repo_by_repo_git, get_session +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth + @celery.task(base=AugurCoreRepoCollectionTask) def collect_releases(repo_git): logger = logging.getLogger(collect_releases.__name__) - with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_obj = get_repo_by_repo_git(repo_git) + repo_id = repo_obj.repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + key_auth = GithubRandomKeyAuth(logger) + with get_session() as session: - releases_model(augur_db, manifest.key_auth, logger, repo_git, repo_id) \ No newline at end of file + releases_model(session, key_auth, logger, repo_git, repo_id) \ No newline at end of file diff --git a/augur/tasks/github/repo_info/core.py b/augur/tasks/github/repo_info/core.py index 2a9f21af72..5a8710e31b 100644 --- a/augur/tasks/github/repo_info/core.py +++ b/augur/tasks/github/repo_info/core.py @@ -1,23 +1,41 @@ #SPDX-License-Identifier: MIT import json import sqlalchemy as s -from augur.tasks.github.util.github_paginator import GithubPaginator +from augur.tasks.github.util.github_data_access import GithubDataAccess +from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess from augur.tasks.github.util.github_paginator import hit_api from augur.tasks.github.util.util import get_owner_repo from augur.tasks.github.util.gh_graphql_entities import request_graphql_dict from augur.application.db.models import * +from augur.application.db.lib import execute_sql from augur.tasks.github.util.github_task_session import * from 
augur.application.db.models.augur_data import RepoBadging from urllib.parse import quote def query_committers_count(key_auth, logger, owner, repo): + data = {} logger.info('Querying committers count\n') url = f'https://api.github.com/repos/{owner}/{repo}/contributors?per_page=100' - - contributors = GithubPaginator(url, key_auth, logger) + ## If the repository is empty there are zero committers, and the API returns nothing at all. Response + ## header of 200 along with an empty JSON. + try: + github_data_access = GithubDataAccess(key_auth, logger) + try: + data = github_data_access.get_resource_count(url) + except Exception as e: + logger.warning(f"JSON Decode error: {e} indicating there are no committers or the repository is empty or archived.") + data = 0 + pass + if not data: + logger.warning("The API Returned an empty JSON object.") + else: + logger.warning("Committer count data returned in JSON") + except ValueError: + logger.warning("The API did not return valid JSON for committer count. This usually occurs on empty or archived repositories.") + data=0 - return len(contributors) + return data def get_repo_data(logger, url, response): data = {} @@ -92,128 +110,79 @@ def grab_repo_info_from_graphql_endpoint(key_auth, logger, query): return data -def repo_info_model(augur_db, key_auth, repo_orm_obj, logger): +def repo_info_model(key_auth, repo_orm_obj, logger): logger.info("Beginning filling the repo_info model for repo: " + repo_orm_obj.repo_git + "\n") owner, repo = get_owner_repo(repo_orm_obj.repo_git) - url = 'https://api.github.com/graphql' - - query = """ - { - repository(owner:"%s", name:"%s"){ - updatedAt - hasIssuesEnabled - issues(states:OPEN) { + query = """query($repo: String!, $owner: String!) { + repository(name: $repo, owner: $owner) { + updatedAt + hasIssuesEnabled + issues(states: OPEN) { totalCount - } - hasWikiEnabled - forkCount - defaultBranchRef { + } + hasWikiEnabled + forkCount + defaultBranchRef { name - } - watchers { + } + watchers { totalCount - } - id - licenseInfo { + } + id + licenseInfo { name url - } - stargazers { + } + stargazers { totalCount - } - codeOfConduct { + } + codeOfConduct { name url - } - issue_count: issues { + } + issue_count: issues { totalCount - } - issues_closed: issues(states:CLOSED) { + } + issues_closed: issues(states: CLOSED) { totalCount - } - pr_count: pullRequests { + } + pr_count: pullRequests { totalCount - } - pr_open: pullRequests(states: OPEN) { + } + pr_open: pullRequests(states: OPEN) { totalCount - } - pr_closed: pullRequests(states: CLOSED) { + } + pr_closed: pullRequests(states: CLOSED) { totalCount - } - pr_merged: pullRequests(states: MERGED) { + } + pr_merged: pullRequests(states: MERGED) { totalCount - } - defaultBranchRef { + } + defaultBranchRef { target { ... 
on Commit { - history { - totalCount - } + history { + totalCount + } } } + } } - } - } - - """ % (owner, repo) - - ############################## - # { - # repository(owner: "chaoss", name: "augur") { - # updatedAt - # hasIssuesEnabled - # issues(states: OPEN) { - # totalCount - # } - # hasWikiEnabled - # forkCount - # defaultBranchRef { - # name - # } - # watchers { - # totalCount - # } - # id - # licenseInfo { - # name - # url - # } - # stargazers { - # totalCount - # } - # codeOfConduct { - # name - # url - # } - # issue_count: issues { - # totalCount - # } - # issues_closed: issues(states: CLOSED) { - # totalCount - # } - # pr_count: pullRequests { - # totalCount - # } - # pr_open: pullRequests(states: OPEN) { - # totalCount - # } - # pr_closed: pullRequests(states: CLOSED) { - # totalCount - # } - # pr_merged: pullRequests(states: MERGED) { - # totalCount - # } - # stargazerCount - # } - # } + } + """ + + github_graphql_data_access = GithubGraphQlDataAccess(key_auth, logger) - try: - data = grab_repo_info_from_graphql_endpoint(key_auth, logger, query) - except Exception as e: - logger.error(f"Could not grab info for repo {repo_orm_obj.repo_id}") - raise e + variables = { + "owner": owner, + "repo": repo + } + + result_keys = ["repository"] + + data = github_graphql_data_access.get_resource(query, variables, result_keys) # Get committers count info that requires seperate endpoint committers_count = query_committers_count(key_auth, logger, owner, repo) @@ -255,7 +224,7 @@ def repo_info_model(augur_db, key_auth, repo_orm_obj, logger): 'data_source': "Github" } - #result = session.insert_data(rep_inf,RepoInfo,['repo_info_id']) #result = self.db.execute(self.repo_info_table.insert().values(rep_inf)) + #result = bulk_insert_dicts(rep_inf,RepoInfo,['repo_info_id']) #result = self.db.execute(self.repo_info_table.insert().values(rep_inf)) insert_statement = s.sql.text("""INSERT INTO repo_info (repo_id,last_updated,issues_enabled, open_issues,pull_requests_enabled,wiki_enabled,pages_enabled,fork_count, default_branch,watchers_count,license,stars_count, @@ -270,7 +239,7 @@ def repo_info_model(augur_db, key_auth, repo_orm_obj, logger): :tool_source, :tool_version, :data_source) """).bindparams(**rep_inf) - augur_db.execute_sql(insert_statement) + execute_sql(insert_statement) # Note that the addition of information about where a repository may be forked from, and whether a repository is archived, updates the `repo` table, not the `repo_info` table. 
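# Illustrative sketch (assumed) of the GithubGraphQlDataAccess.get_resource call pattern adopted above, using the class added later in this diff. The trimmed query and the fetch_repo_counts_sketch name are only for illustration.
import logging
from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess

def fetch_repo_counts_sketch(key_auth, owner, repo):
    logger = logging.getLogger(__name__)
    query = """query($repo: String!, $owner: String!) {
      repository(name: $repo, owner: $owner) {
        forkCount
        stargazers { totalCount }
      }
    }"""
    variables = {"owner": owner, "repo": repo}
    graphql = GithubGraphQlDataAccess(key_auth, logger)
    # result_keys walks the response body: data -> repository
    return graphql.get_resource(query, variables, ["repository"])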
forked = is_forked(key_auth, logger, owner, repo) @@ -283,7 +252,7 @@ def repo_info_model(augur_db, key_auth, repo_orm_obj, logger): archived = 0 update_repo_data = s.sql.text("""UPDATE repo SET forked_from=:forked, repo_archived=:archived, repo_archived_date_collected=:archived_date_collected WHERE repo_id=:repo_id""").bindparams(forked=forked, archived=archived, archived_date_collected=archived_date_collected, repo_id=repo_orm_obj.repo_id) - augur_db.execute_sql(update_repo_data) + execute_sql(update_repo_data) logger.info(f"Inserted info for {owner}/{repo}\n") diff --git a/augur/tasks/github/repo_info/tasks.py b/augur/tasks/github/repo_info/tasks.py index b31bc7bf62..85d639d2a6 100644 --- a/augur/tasks/github/repo_info/tasks.py +++ b/augur/tasks/github/repo_info/tasks.py @@ -1,10 +1,12 @@ import logging -from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.application.db.session import DatabaseSession from augur.tasks.github.repo_info.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_repo_by_repo_git +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from augur.application.db import get_engine #Task to get regular misc github info @@ -13,23 +15,23 @@ def collect_repo_info(repo_git: str): logger = logging.getLogger(collect_repo_info.__name__) - with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query, 'one') - - repo_info_model(augur_db, manifest.key_auth, repo, logger) + repo = get_repo_by_repo_git(repo_git) + + key_auth = GithubRandomKeyAuth(logger) + + repo_info_model(key_auth, repo, logger) #Task to get CII api data for linux badge info using github data. 
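# Illustrative sketch (assumed) of the session-free task pattern used by collect_repo_info above: look the repo up, build a GithubRandomKeyAuth, and pass both to the model function. The helper names come from this diff; run_repo_info_sketch itself is hypothetical.
import logging
from augur.application.db.lib import get_repo_by_repo_git
from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth
from augur.tasks.github.repo_info.core import repo_info_model

def run_repo_info_sketch(repo_git: str):
    logger = logging.getLogger("repo_info_sketch")
    repo = get_repo_by_repo_git(repo_git)   # ORM row; exposes repo.repo_id and repo.repo_git
    key_auth = GithubRandomKeyAuth(logger)  # rotates GitHub API keys across requests
    repo_info_model(key_auth, repo, logger)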
@celery.task(base=AugurCoreRepoCollectionTask) def collect_linux_badge_info(repo_git: str): + engine = get_engine() + logger = logging.getLogger(collect_linux_badge_info.__name__) - with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query, 'one') + repo = get_repo_by_repo_git(repo_git) + + with DatabaseSession(logger, engine=engine) as session: - badges_model(logger, repo_git, repo.repo_id, augur_db) + badges_model(logger, repo_git, repo.repo_id, session) diff --git a/augur/tasks/github/traffic.py b/augur/tasks/github/traffic.py new file mode 100644 index 0000000000..8f1903e4ea --- /dev/null +++ b/augur/tasks/github/traffic.py @@ -0,0 +1,67 @@ +import logging + +from augur.tasks.init.celery_app import celery_app as celery +from augur.application.db.data_parse import extract_needed_clone_history_data +from augur.tasks.util.worker_util import remove_duplicate_dicts +from augur.tasks.github.util.util import get_owner_repo +from augur.application.db.models import RepoClone +from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth + + +@celery.task +def collect_github_repo_clones_data(repo_git: str) -> None: + + logger = logging.getLogger(collect_github_repo_clones_data.__name__) + + repo_obj = get_repo_by_repo_git(repo_git) + repo_id = repo_obj.repo_id + + owner, repo = get_owner_repo(repo_git) + + logger.info(f"Collecting Github repository clone data for {owner}/{repo}") + + key_auth = GithubRandomKeyAuth(logger) + + clones_data = retrieve_all_clones_data(repo_git, logger, key_auth) + + if clones_data: + process_clones_data(clones_data, f"{owner}/{repo}: Traffic task", repo_id, logger) + else: + logger.info(f"{owner}/{repo} has no clones") + +def retrieve_all_clones_data(repo_git: str, logger, key_auth): + # owner, repo = get_owner_repo(repo_git) + + # url = f"https://api.github.com/repos/{owner}/{repo}/traffic/clones" + + # clones = GithubPaginator(url, key_auth, logger) + + # num_pages = clones.get_num_pages() + all_data = [] + # for page_data, page in clones.iter_pages(): + + # if page_data is None: + # return all_data + + # elif len(page_data) == 0: + # logger.debug(f"{repo.capitalize()} Traffic Page {page} contains no data...returning") + # logger.info(f"Traffic Page {page} of {num_pages}") + # return all_data + + # logger.info(f"{repo} Traffic Page {page} of {num_pages}") + + # all_data += page_data + + return all_data + + +def process_clones_data(clones_data, task_name, repo_id, logger) -> None: + clone_history_data = clones_data[0]['clones'] + + clone_history_data_dicts = extract_needed_clone_history_data(clone_history_data, repo_id) + + clone_history_data_dicts = remove_duplicate_dicts(clone_history_data_dicts, 'clone_data_timestamp') + logger.info(f"{task_name}: Inserting {len(clone_history_data_dicts)} clone history records") + + bulk_insert_dicts(logger, clone_history_data_dicts, RepoClone, ['repo_id']) diff --git a/augur/tasks/github/traffic/__init__.py b/augur/tasks/github/traffic/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/augur/tasks/github/traffic/core.py b/augur/tasks/github/traffic/core.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/augur/tasks/github/traffic/tasks.py b/augur/tasks/github/traffic/tasks.py deleted file mode 100644 index 068c9616b7..0000000000 --- a/augur/tasks/github/traffic/tasks.py +++ 
/dev/null @@ -1,69 +0,0 @@ -import logging - -from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.data_parse import extract_needed_clone_history_data -from augur.tasks.github.util.github_paginator import GithubPaginator -from augur.tasks.github.util.github_task_session import GithubTaskManifest -from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models import RepoClone, Repo -from augur.application.db.util import execute_session_query - -@celery.task -def collect_github_repo_clones_data(repo_git: str) -> None: - - logger = logging.getLogger(collect_github_repo_clones_data.__name__) - - # using GithubTaskSession to get our repo_obj for which we will store data of clones - with GithubTaskManifest(logger) as manifest: - - query = manifest.augur_db.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id - - owner, repo = get_owner_repo(repo_git) - - logger.info(f"Collecting Github repository clone data for {owner}/{repo}") - - clones_data = retrieve_all_clones_data(repo_git, logger, manifest.key_auth) - - if clones_data: - process_clones_data(clones_data, f"{owner}/{repo}: Traffic task", repo_id, manifest.augur_db) - else: - logger.info(f"{owner}/{repo} has no clones") - -def retrieve_all_clones_data(repo_git: str, logger, key_auth): - owner, repo = get_owner_repo(repo_git) - - url = f"https://api.github.com/repos/{owner}/{repo}/traffic/clones" - - clones = GithubPaginator(url, key_auth, logger) - - num_pages = clones.get_num_pages() - all_data = [] - for page_data, page in clones.iter_pages(): - - if page_data is None: - return all_data - - elif len(page_data) == 0: - logger.debug(f"{repo.capitalize()} Traffic Page {page} contains no data...returning") - logger.info(f"Traffic Page {page} of {num_pages}") - return all_data - - logger.info(f"{repo} Traffic Page {page} of {num_pages}") - - all_data += page_data - - return all_data - - -def process_clones_data(clones_data, task_name, repo_id, logger, augur_db) -> None: - clone_history_data = clones_data[0]['clones'] - - clone_history_data_dicts = extract_needed_clone_history_data(clone_history_data, repo_id) - - clone_history_data = remove_duplicate_dicts(clone_history_data_dicts, 'clone_data_timestamp') - logger.info(f"{task_name}: Inserting {len(clone_history_data_dicts)} clone history records") - - augur_db.insert_data(clone_history_data_dicts, RepoClone, ['repo_id']) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index cb5df455b7..0667ab3315 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -250,9 +250,11 @@ def hit_api(self,query,variables={}): def extract_paginate_result(self,responseDict): if not responseDict: + self.logger.error(f"DEBUG CHECK THIS {responseDict}") raise TimeoutError("No data received from endpoint.") #err = process_graphql_dict_response(self.logger, responseObject, response) if 'data' not in responseDict: + self.logger.error(f"DEBUG CHECK THIS {responseDict}") self.logger.error(responseDict) raise KeyError @@ -293,6 +295,8 @@ def __getitem__(self, index):# -> dict: #extract the content from the graphql query result coreData = self.extract_paginate_result(data) + self.logger.debug(f"for page in range 298: {data}") + content = [data['node'] for data in list(coreData['edges'])] if self.repaginate: @@ 
-323,6 +327,8 @@ def __len__(self): data = self.request_graphql_dict(variables=params) coreData = self.extract_paginate_result(data) + self.logger.debug(f"__len__: debug: {data}") + totalCount = int(coreData['totalCount']) return totalCount @@ -338,17 +344,21 @@ def __iter__(self): #self.logger.info(f"{params}") data = self.request_graphql_dict(variables=params) try: - coreData = self.extract_paginate_result(data) - #Check to make sure we have data - coreData['totalCount'] + coreData = self.extract_paginate_result(data) + if coreData is not None: + if coreData.get('totalCount') is not None: + self.logger.info("debug-gog: ... core data obtained") + else: + self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") + coreData['totalCount'] = 0 + else: + self.logger.error("Core data is None, cannot proceed with operations on it, but assigning a value of Zero to ensure continued collection.") + yield None + return except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") - self.logger.error( - ''.join(traceback.format_exception(None, e, e.__traceback__))) - - self.logger.info(f"Graphql paramters: {params}") - return + self.logger.error(''.join(traceback.format_exception(None, e, e.__traceback__))) if int(coreData['totalCount']) == 0: @@ -380,6 +390,7 @@ def __iter__(self): data = self.request_graphql_dict(variables=params) coreData = self.extract_paginate_result(data) + self.logger.debug(f"while core data: {data}") #print(coreData) if len(coreData['edges']) == 0: diff --git a/augur/tasks/github/util/github_api_key_handler.py b/augur/tasks/github/util/github_api_key_handler.py index d87d7495eb..47933e67dc 100644 --- a/augur/tasks/github/util/github_api_key_handler.py +++ b/augur/tasks/github/util/github_api_key_handler.py @@ -6,10 +6,11 @@ from sqlalchemy.orm import Session from augur.tasks.util.redis_list import RedisList -from augur.application.db.session import DatabaseSession -from augur.application.db.lib import get_value +from augur.application.db.lib import get_value, get_worker_oauth_keys from sqlalchemy import func +RATE_LIMIT_URL = "https://api.github.com/rate_limit" + class NoValidKeysError(Exception): pass @@ -19,7 +20,6 @@ class GithubApiKeyHandler(): """Handles Github API key retrieval from the database and redis Attributes: - session (DatabaseSession): Database connection logger (logging.Logger): Handles all logs oauth_redis_key (str): The key where the github api keys are cached in redis redis_key_list (RedisList): Acts like a python list, and interacts directly with the redis cache @@ -27,9 +27,8 @@ class GithubApiKeyHandler(): key: (List[str]): List of keys retrieve from database or cache """ - def __init__(self, session: Session, logger): + def __init__(self, logger): - self.session = session self.logger = logger self.oauth_redis_key = "github_oauth_keys_list" @@ -69,16 +68,12 @@ def get_api_keys_from_database(self) -> List[str]: Returns: Github api keys that are in the database """ - from augur.application.db.models import WorkerOauth - select = WorkerOauth.access_token - # randomizing the order at db time - #select.order_by(func.random()) - where = [WorkerOauth.access_token != self.config_key, WorkerOauth.platform == 'github'] + keys = get_worker_oauth_keys('github') - return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).order_by(func.random()).all()] - #return [key_tuple[0] for key_tuple in 
self.session.query(select).filter(*where).all()] + filtered_keys = [item for item in keys if item != self.config_key] + return filtered_keys def get_api_keys(self) -> List[str]: """Retrieves all valid Github API Keys @@ -112,6 +107,8 @@ def get_api_keys(self) -> List[str]: if len(keys) == 0: return [] + + keys = [key.strip() for key in keys] valid_keys = [] with httpx.Client() as client: @@ -159,12 +156,9 @@ def is_bad_api_key(self, client: httpx.Client, oauth_key: str) -> bool: True if key is bad. False if the key is good """ - # this endpoint allows us to check the rate limit, but it does not use one of our 5000 requests - url = "https://api.github.com/rate_limit" - headers = {'Authorization': f'token {oauth_key}'} - data = client.request(method="GET", url=url, headers=headers, timeout=180).json() + data = client.request(method="GET", url=RATE_LIMIT_URL, headers=headers, timeout=180).json() try: if data["message"] == "Bad credentials": @@ -172,4 +166,25 @@ def is_bad_api_key(self, client: httpx.Client, oauth_key: str) -> bool: except KeyError: pass - return False \ No newline at end of file + return False + + @staticmethod + def get_key_rate_limit(client, github_key): + + headers = {'Authorization': f'token {github_key}'} + + data = client.request(method="GET", url=RATE_LIMIT_URL, headers=headers, timeout=180).json() + + if "message" in data: + return None, None + + def convert_rate_limit_request(data): + return { + "requests_remaining": data["remaining"], + "reset_epoch": data["reset"] + } + + core_data = convert_rate_limit_request(data["resources"]["core"]) + graphql_data = convert_rate_limit_request(data["resources"]["graphql"]) + + return core_data, graphql_data \ No newline at end of file diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py new file mode 100644 index 0000000000..06cbd3104c --- /dev/null +++ b/augur/tasks/github/util/github_data_access.py @@ -0,0 +1,225 @@ +import logging +import time +import httpx +from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception, RetryError +from urllib.parse import urlparse, parse_qs, urlencode +from keyman.KeyClient import KeyClient + +GITHUB_RATELIMIT_REMAINING_CAP = 50 + + +class RatelimitException(Exception): + + def __init__(self, response, keys_used, message="Github Rate limit exceeded") -> None: + + self.response = response + + super().__init__(f"{message}. Keys used: {keys_used}") + +class UrlNotFoundException(Exception): + pass + +class NotAuthorizedException(Exception): + pass + +class GithubDataAccess: + + def __init__(self, key_manager, logger: logging.Logger): + + self.logger = logger + self.key_client = KeyClient("github_rest", logger) + self.key = None + self.expired_keys_for_request = [] + + def get_resource_count(self, url): + + # set per_page to 100 explicitly so we know each page is 100 long + params = {"per_page": 100} + url = self.__add_query_params(url, params) + + num_pages = self.get_resource_page_count(url) + + # get data for last page + params = {"page": num_pages} + url = self.__add_query_params(url, params) + + data = self.get_resource(url) + + return (100 * (num_pages -1)) + len(data) + + def paginate_resource(self, url): + + response = self.make_request_with_retries(url) + data = response.json() + + # need to ensure data is a list so yield from works properly + if not isinstance(data, list): + raise Exception(f"GithubApiHandler.paginate_resource must be used with url that returns a list. 
Use GithubApiHandler.get_resource to retrieve data that is not paginated. The url of {url} returned a {type(data)}.") + + yield from data + + while 'next' in response.links.keys(): + + next_page = response.links['next']['url'] + + response = self.make_request_with_retries(next_page) + data = response.json() + + # need to ensure data is a list so yield from works properly + if not isinstance(data, list): + raise Exception(f"GithubApiHandler.paginate_resource must be used with url that returns a list. Use GithubApiHandler.get_resource to retrieve data that is not paginated. The url of {url} returned a {type(data)}. ") + + yield from data + + return + + def get_resource_page_count(self, url): + + response = self.make_request_with_retries(url, method="HEAD") + + if 'last' not in response.links.keys(): + self.logger.warning(f"Github response without links. Headers: {response.headers}.") + return 1 + + try: + last_page_url = response.links['last']['url'] + + parsed_url = urlparse(last_page_url) + + return int(parse_qs(parsed_url.query)['page'][0]) + except (KeyError, ValueError): + raise Exception(f"Unable to parse 'last' url from response: {response.links['last']}") + + def get_resource(self, url): + + response = self.make_request_with_retries(url) + + return response.json() + + # TODO: Handle timeout exceptions better + def make_request(self, url, method="GET", timeout=100): + + with httpx.Client() as client: + + if not self.key: + self.key = self.key_client.request() + + headers = {"Authorization": f"token {self.key}"} + + response = client.request(method=method, url=url, headers=headers, timeout=timeout, follow_redirects=True) + + if response.status_code in [403, 429]: + self.expired_keys_for_request.append(self.key) + raise RatelimitException(response, self.expired_keys_for_request[-5:]) + + if response.status_code == 404: + raise UrlNotFoundException(f"Could not find {url}") + + if response.status_code == 401: + raise NotAuthorizedException(f"Could not authorize with the github api") + + response.raise_for_status() + + try: + if "X-RateLimit-Remaining" in response.headers and int(response.headers["X-RateLimit-Remaining"]) < GITHUB_RATELIMIT_REMAINING_CAP: + self.expired_keys_for_request.append(self.key) + raise RatelimitException(response, self.expired_keys_for_request[-5:]) + except ValueError: + self.logger.warning(f"X-RateLimit-Remaining was not an integer. Value: {response.headers['X-RateLimit-Remaining']}") + + + return response + + def make_request_with_retries(self, url, method="GET", timeout=100): + """ What this method does: + 1. Catches RetryError and re-raises the last exception thrown by the final retry attempt + """ + + try: + return self.__make_request_with_retries(url, method, timeout) + except RetryError as e: + raise e.last_attempt.exception() + + @retry(stop=stop_after_attempt(10), wait=wait_fixed(5), retry=retry_if_exception(lambda exc: not isinstance(exc, UrlNotFoundException))) + def __make_request_with_retries(self, url, method="GET", timeout=100): + """ What this method does: + 1. Retries 10 times + 2. Waits 5 seconds between retries + 3. Does not retry UrlNotFoundException + 4. 
Catches RatelimitException and waits or expires key before raising exception + """ + + try: + result = self.make_request(url, method, timeout) + self.expired_keys_for_request = [] + return result + except RatelimitException as e: + self.__handle_github_ratelimit_response(e.response) + raise e + except NotAuthorizedException as e: + self.expired_keys_for_request = [] + self.__handle_github_not_authorized_response() + raise e + + def __handle_github_not_authorized_response(self): + + self.key = self.key_client.invalidate(self.key) + + + def __handle_github_ratelimit_response(self, response): + + headers = response.headers + previous_key = self.key + + if "Retry-After" in headers: + + retry_after = int(headers["Retry-After"]) + self.logger.info('\n\n\n\nEncountered secondary rate limit issue.\n\n\n\n') + self.key = self.key_client.expire(self.key, time.time() + retry_after) + + elif "X-RateLimit-Remaining" in headers and int(headers["X-RateLimit-Remaining"]) < GITHUB_RATELIMIT_REMAINING_CAP: + current_epoch = int(time.time()) + epoch_when_key_resets = int(headers["X-RateLimit-Reset"]) + key_reset_time = epoch_when_key_resets - current_epoch + + if key_reset_time < 0: + self.logger.error(f"Key reset time was less than 0 setting it to 0.\nThe current epoch is {current_epoch} and the epoch that the key resets at is {epoch_when_key_resets}") + key_reset_time = 0 + + self.logger.info(f"\n\n\nAPI rate limit exceeded. Key resets in {key_reset_time} seconds. Informing key manager that key is expired") + self.key = self.key_client.expire(self.key, epoch_when_key_resets) + + else: + self.key = self.key_client.expire(self.key, time.time() + 60) + + if previous_key == self.key: + self.logger.error(f"The same key was returned after a request to expire it was sent (key: {self.key[-5:]})") + + def __add_query_params(self, url: str, additional_params: dict) -> str: + """Add query params to a url. + + Args: + url: the url that is being modified + additional_params: key value pairs specififying the paramaters to be added + + Returns: + The url with the key value pairs in additional_params added as query params + """ + url_components = urlparse(url) + original_params = parse_qs(url_components.query) + # Before Python 3.5 you could update original_params with + # additional_params, but here all the variables are immutable. 
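# Note (editorial comment, assumed behavior of the standard library calls): the dict-unpacking merge below lets additional_params override any value already present in the query string, and urlencode(..., doseq=True) is required because parse_qs returns every value as a list (e.g. {"per_page": ["100"]}).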
+ merged_params = {**original_params, **additional_params} + updated_query = urlencode(merged_params, doseq=True) + # _replace() is how you can create a new NamedTuple with a changed field + return url_components._replace(query=updated_query).geturl() + + + + + + + + + + diff --git a/augur/tasks/github/util/github_graphql_data_access.py b/augur/tasks/github/util/github_graphql_data_access.py new file mode 100644 index 0000000000..96b0c6ab76 --- /dev/null +++ b/augur/tasks/github/util/github_graphql_data_access.py @@ -0,0 +1,224 @@ +import logging +import time +import httpx +from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception, RetryError + +URL = "https://api.github.com/graphql" + +class RatelimitException(Exception): + + def __init__(self, response, message="Github Rate limit exceeded") -> None: + + self.response = response + + super().__init__(message) + +class NotFoundException(Exception): + pass + +class InvalidDataException(Exception): + pass + +class GithubGraphQlDataAccess: + + + def __init__(self, key_manager, logger: logging.Logger, ingore_not_found_error=False): + + self.logger = logger + self.key_manager = key_manager + self.ingore_not_found_error = ingore_not_found_error + + def get_resource(self, query, variables, result_keys): + + result_json = self.make_request_with_retries(query, variables).json() + + data = self.__extract_data_section(result_keys, result_json) + + return data + + + def paginate_resource(self, query, variables, result_keys): + + params = { + "numRecords" : 100, + "cursor" : None + } + params.update(variables) + + result_json = self.make_request_with_retries(query, params).json() + + data = self.__extract_data_section(result_keys, result_json) + + if self.__get_total_count(data) == 0: + return + + yield from self.__extract_raw_data_into_list(data) + + while self.__has_next_page(data): + params["cursor"] = self.__get_next_page_cursor(data) + + result_json = self.make_request_with_retries(query, params).json() + + data = self.__extract_data_section(result_keys, result_json) + + yield from self.__extract_raw_data_into_list(data) + + def make_request(self, query, variables, timeout=40): + + with httpx.Client() as client: + + json_dict = { + 'query' : query + } + + if variables: + json_dict['variables'] = variables + + response = client.post(url=URL,auth=self.key_manager,json=json_dict, timeout=timeout) + + response.raise_for_status() + + if not self.ingore_not_found_error: + + json_response = response.json() + if "errors" in json_response and len(json_response["errors"]) > 0: + errors = json_response["errors"] + + not_found_error = self.__find_first_error_of_type(errors, "NOT_FOUND") + + if not_found_error: + message = not_found_error.get("message", "Resource not found.") + raise NotFoundException(f"Could not find: {message}") + + raise Exception(f"Github Graphql Data Access Errors: {errors}") + + return response + + + def make_request_with_retries(self, query, variables, timeout=100): + """ What method does? + 1. Catches RetryError and rethrows a nicely formatted OutOfRetriesException that includes that last exception thrown + """ + + try: + return self.__make_request_with_retries(query, variables, timeout) + except RetryError as e: + raise e.last_attempt.exception() + + @retry(stop=stop_after_attempt(10), wait=wait_fixed(5), retry=retry_if_exception(lambda exc: not isinstance(exc, NotFoundException))) + def __make_request_with_retries(self, query, variables, timeout=40): + """ What method does? + 1. Retires 10 times + 2. 
Waits 5 seconds between retires + 3. Does not rety UrlNotFoundException + 4. Catches RatelimitException and waits before raising exception + """ + + try: + return self.make_request(query, variables, timeout) + except RatelimitException as e: + self.__handle_github_ratelimit_response(e.response) + raise e + + def __handle_github_ratelimit_response(self, response): + + headers = response.headers + + if "Retry-After" in headers: + + retry_after = int(headers["Retry-After"]) + self.logger.info( + f'\n\n\n\nSleeping for {retry_after} seconds due to secondary rate limit issue.\n\n\n\n') + time.sleep(retry_after) + + elif "X-RateLimit-Remaining" in headers and int(headers["X-RateLimit-Remaining"]) == 0: + current_epoch = int(time.time()) + epoch_when_key_resets = int(headers["X-RateLimit-Reset"]) + key_reset_time = epoch_when_key_resets - current_epoch + + if key_reset_time < 0: + self.logger.error(f"Key reset time was less than 0 setting it to 0.\nThe current epoch is {current_epoch} and the epoch that the key resets at is {epoch_when_key_resets}") + key_reset_time = 0 + + self.logger.info(f"\n\n\nAPI rate limit exceeded. Sleeping until the key resets ({key_reset_time} seconds)") + time.sleep(key_reset_time) + else: + time.sleep(60) + + def __extract_data_section(self, keys, json_response): + + if not json_response: + raise Exception(f"Empty data returned. Data: {json_response}") + + if 'data' not in json_response: + raise Exception(f"Error: 'data' key missing from response. Response: {json_response}") + + core = json_response['data'] + + # iterate deeper into the json_response object until we get to the desired data + for value in keys: + + if core is None: + raise Exception(f"Error: 'core' is None when trying to index by {value}. Response: {json_response}") + + core = core[value] + + if core is None: + raise InvalidDataException(f"Error: The data section is null. Unable to process") + + return core + + def __extract_raw_data_into_list(self, data): + + if 'edges' not in data: + raise Exception(f"Error: 'edges' key not present in data. Data {data}") + + data_list = [] + for edge in data['edges']: + + if 'node' not in edge: + raise Exception(f"Error: 'node' key not present in data. Data {data}") + + data_list.append(edge['node']) + + return data_list + + def __has_next_page(self, data): + + if 'pageInfo' not in data: + raise Exception(f"Error: 'pageInfo' key not present in data. Data {data}") + + if 'hasNextPage' not in data['pageInfo']: + raise Exception(f"Error: 'hasNextPage' key not present in data. Data {data}") + + if not isinstance(data['pageInfo']['hasNextPage'], bool): + raise Exception(f"Error: pageInfo.hasNextPage is not a bool. Data {data}") + + return data['pageInfo']['hasNextPage'] + + def __get_next_page_cursor(self, data): + + if 'pageInfo' not in data: + raise Exception(f"Error: 'pageInfo' key not present in data. Data {data}") + + if 'endCursor' not in data['pageInfo']: + raise Exception(f"Error: 'endCursor' key not present in data. Data {data}") + + return data['pageInfo']['endCursor'] + + def __get_total_count(self, data): + + if 'totalCount' not in data: + raise Exception(f"Error: totalCount key not found in data. Data: {data}") + + if data["totalCount"] is None: + raise Exception(f"Error: totalCount is null. Data: {data}") + + try: + return int(data["totalCount"]) + except ValueError as exc: + raise Exception(f"Error: totalCount is not an integer. 
Data: {data}") from exc + + def __find_first_error_of_type(self, errors, type): + + return next((error for error in errors if error.get("type").lower() == type.lower()), None) diff --git a/augur/tasks/github/util/github_paginator.py b/augur/tasks/github/util/github_paginator.py index 1818aef31e..bd141d0c32 100644 --- a/augur/tasks/github/util/github_paginator.py +++ b/augur/tasks/github/util/github_paginator.py @@ -1,20 +1,13 @@ """Logic to paginate the Github API.""" -import collections import httpx import time -import json import logging -from typing import List, Optional, Union, Generator, Tuple -from urllib.parse import urlencode, urlparse, parse_qs, urlunparse +from typing import Optional from enum import Enum - -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth -from augur.tasks.github.util.util import parse_json_response - def hit_api(key_manager, url: str, logger: logging.Logger, timeout: float = 10, method: str = 'GET', ) -> Optional[httpx.Response]: """Ping the api and get the data back for the page. @@ -157,472 +150,3 @@ class GithubApiResult(Enum): BAD_CREDENTIALS = 7 HTML = 8 EMPTY_STRING = 9 - - -class GithubPaginator(collections.abc.Sequence): - """This class is a sequence that handles paginating through data on the Github API. - - Attributes: - url (str): The url that we are collecting data - key_mangager (GithubRandomKeyAuth): Custom httpx auth class - that randomizes the github api key a request gets. - This is how the requests are getting their api keys - logger (logging.Logger): Logger that handler printing information to files and stdout - """ - - def __init__(self, url: str, key_manager: GithubRandomKeyAuth, logger: logging.Logger, from_datetime=None, to_datetime=None): - """Initialize the class GithubPaginator. - - Args: - url: url that the data is being collected - key_manager: class that randomly selects a Github API key for each request - logger: handles logging - from_datetime: collects data after this datatime (not yet implemented) - to_datetime: collects data before this datatime (not yet implemented) - """ - remove_fields = ["per_page", "page"] - url = clean_url(url, remove_fields) - - # we need to add query params directly to the url, instead of passing the param to the httpx.Client.request - # this is because github will only append specified params to the links in the headers if they are a part - # of the url, and not the params with the request - params = {"per_page": 100} - url = add_query_params(url, params) - - self.url = url - self.key_manager = key_manager - self.logger = logger - - # get the logger from the key manager - # self.logger = key_manager.logger - - self.from_datetime = from_datetime - self.to_datetime = to_datetime - - def __getitem__(self, index: int) -> Optional[dict]: - """Get the value at index of the Github API data returned from the url. 
- - Args: - index: The index of the desired data from the Github API - - Returns: - The value at the index - """ - # if isinstance(index, slice) is True: - - # data_slice = index - # start = data_slice.start - # stop = data_slice.stop - # step = data_slice.step - - # first_item_page = (start // 100) + 1 - # end_item_page = (stop // 100) + 1 - - # all_data: List[dict] = [] - - # for page_number in range(first_item_page, end_item_page+1): - - # # create url to query - # params = {"page": items_page} - # url = add_query_params(self.url, params) - - # data, _ = self.retrieve_data(url) - - # all_data += data - - # first_page_index = start % 100 - - # needed_data = [] - # for index in range(start, stop, step): - # needed_data.append(all_data[index]) - - # return needed_data - - - # get the page the item is on - items_page = (index // 100) + 1 - - # create url to query - params = {"page": items_page} - url = add_query_params(self.url, params) - - data, _, result = self.retrieve_data(url) - - if result != GithubApiResult.SUCCESS: - self.logger.debug("Unable to get item from the api") - return None - - # get the position of data on the page - page_index = index % 100 - - try: - return data[page_index] - except KeyError as e: - raise KeyError("Data does not exists for that index") from e - - def __len__(self): - """Get the length of the Github API data. - - Returns: - The length of the Github API data at the url. - - Examples: - This function is called when len() is called on the GithubPaginator class for example. - - issues = GithubPaginator(url, session.oauths, logger) - issue_len = len(issues) - """ - - num_pages = self.get_num_pages() - - self.logger.info(f"Num pages: {num_pages}") - - params = {"page": num_pages} - url = add_query_params(self.url, params) - - # get the amount of data on last page - data, _, result = self.retrieve_data(url) - - if result == GithubApiResult.SUCCESS: - return (100 * (num_pages -1)) + len(data) - - self.logger.debug("Unable to retrieve data length from api") - return 0 - - def __iter__(self) -> Generator[Optional[dict], None, None]: - """Provide data from Github API via a generator that yields one dict at a time. - - Yields: - A piece of data from the github api as the specified url - """ - data_list, response, result = self.retrieve_data(self.url) - - if result != GithubApiResult.SUCCESS: - self.logger.debug("Failed to retrieve the data even though 10 attempts were given") - yield None - return - - # yield the first page data - for data in data_list: - yield data - - while 'next' in response.links.keys(): - next_page = response.links['next']['url'] - - # Here we don't need to pass in params with the page, or the default params because the url from the headers already has those values - data_list, response, result = self.retrieve_data(next_page) - - if result != GithubApiResult.SUCCESS: - self.logger.debug("Failed to retrieve the data even though 10 attempts were given") - return - - for data in data_list: - yield data - - def iter_pages(self) -> Generator[Tuple[Optional[List[dict]], int], None, None]: - """Provide data from Github API via a generator that yields a page of dicts at a time. 
- - Returns: - A page of data from the Github API at the specified url - """ - # retrieves the data for the given url - data_list, response, result = self.retrieve_data(self.url) - - if result != GithubApiResult.SUCCESS: - self.logger.debug("Failed to retrieve the data even though 10 attempts were given") - yield None, None - return - - # this retrieves the page for the given url - page_number = get_url_page_number(self.url) - - # yields the first page of data and its page number - yield data_list, page_number - - while 'next' in response.links.keys(): - - # gets the next page from the last responses header - next_page = response.links['next']['url'] - - # Here we don't need to pass in params with the page, or the default params because the url from the headers already has those values - data_list, response, result = self.retrieve_data(next_page) - - if result != GithubApiResult.SUCCESS: - self.logger.debug(f"Failed to retrieve the data for even though 10 attempts were given. Url: {next_page}") - return - - page_number = get_url_page_number(next_page) - - # if either the data or response is None then yield None and return - if data_list is None or response is None: - return - - # yield the data from the page and its number - yield data_list, page_number - - def retrieve_data(self, url: str) -> Tuple[Optional[List[dict]], Optional[httpx.Response]]: - """Attempt to retrieve data at given url. - - Args: - url: The url to retrieve the data from - - Returns - The response object from hitting the url and the data on the page - """ - timeout = 30 - timeout_count = 0 - num_attempts = 1 - while num_attempts <= 10: - - response = hit_api(self.key_manager, url, self.logger, timeout) - - if response is None: - if timeout_count == 10: - self.logger.error(f"Request timed out 10 times for {url}") - return None, None, GithubApiResult.TIMEOUT - - timeout = timeout * 1.1 - num_attempts += 1 - continue - - # if api returns a status of 204 No Content then return empty list - if response.status_code == 204: - return [], response, GithubApiResult.SUCCESS - - - page_data = parse_json_response(self.logger, response) - - - # if the data is a list, then return it and the response - if isinstance(page_data, list) is True: - return page_data, response, GithubApiResult.SUCCESS - - # if the data is a dict then call process_dict_response, and - if isinstance(page_data, dict) is True: - dict_processing_result = process_dict_response(self.logger, response, page_data) - - if dict_processing_result == GithubApiResult.NEW_RESULT: - self.logger.info(f"Encountered new dict response from api on url: {url}. Response: {page_data}") - return None, None, GithubApiResult.NEW_RESULT - - if dict_processing_result == GithubApiResult.REPO_NOT_FOUND: - return None, response, GithubApiResult.REPO_NOT_FOUND - - if dict_processing_result in (GithubApiResult.SECONDARY_RATE_LIMIT, GithubApiResult.ABUSE_MECHANISM_TRIGGERED): - continue - - if dict_processing_result == GithubApiResult.RATE_LIMIT_EXCEEDED: - num_attempts = 0 - continue - - if isinstance(page_data, str) is True: - str_processing_result: Union[str, List[dict]] = self.process_str_response(page_data) - - if isinstance(str_processing_result, list): - return str_processing_result, response, GithubApiResult.SUCCESS - - num_attempts += 1 - - self.logger.error("Unable to collect data in 10 attempts") - return None, None, GithubApiResult.NO_MORE_ATTEMPTS - - def get_num_pages(self) -> Optional[int]: - """Get the number of pages of data that a url can paginate through. 
- - Returns: - The number of pages a url can access - """ - timeout: float = 5 - num_attempts = 0 - while num_attempts < 10: - r = hit_api(self.key_manager, self.url, self.logger, timeout=timeout, method="HEAD") - - if r: - break - - timeout = timeout * 1.2 - else: - raise RuntimeError("Unable to get the number of pages of data in 10 attempts") - - if 'last' not in r.links.keys(): - return 1 - - # get the last url from header - last_page_url = r.links['last']['url'] - - parsed_url = urlparse(last_page_url) - try: - num_pages = int(parse_qs(parsed_url.query)['page'][0]) - except (KeyError, ValueError): - return None - - return num_pages - - def hit_api(self, url, timeout): - - return hit_api(self.key_manager, url, self.logger, timeout) - - -################################################### - - def process_str_response(self, page_data: str) -> Union[str, List[dict]]: - """Process an api response of type string. - - Args: - page_data: the string response from the api that is being processed - - Returns: - html_response, empty_string, and failed_to_parse_jsonif the data is not processable. - Or a list of dicts if the json was parasable - """ - self.logger.info(f"Warning! page_data was string: {page_data}\n") - - if "" in page_data: - self.logger.info("HTML was returned, trying again...\n") - return GithubApiResult.HTML - - if not page_data: - self.logger.info("Empty string, trying again...\n") - return GithubApiResult.EMPTY_STRING - - try: - list_of_dict_page_data = json.loads(page_data) - return list_of_dict_page_data - except TypeError: - return "failed_to_parse_json" - - -################################################################################ - -# Url Helper Method to remove query paramaters from the url -def clean_url(url: str, keys: List[str]) -> str: - """Remove query params from url. - - Args: - url: the url that is being modified - keys: the query params that are being removed - - Returns: - A url with the params in keys removed - """ - u = urlparse(url) - query = parse_qs(u.query, keep_blank_values=True) - - for key in keys: - query.pop(key, None) - - u = u._replace(query=urlencode(query, True)) - - return urlunparse(u) - - -def add_query_params(url: str, additional_params: dict) -> str: - """Add query params to a url. - - Args: - url: the url that is being modified - additional_params: key value pairs specififying the paramaters to be added - - Returns: - The url with the key value pairs in additional_params added as query params - """ - url_components = urlparse(url) - original_params = parse_qs(url_components.query) - # Before Python 3.5 you could update original_params with - # additional_params, but here all the variables are immutable. - merged_params = {**original_params, **additional_params} - updated_query = urlencode(merged_params, doseq=True) - # _replace() is how you can create a new NamedTuple with a changed field - return url_components._replace(query=updated_query).geturl() - - - -################################################################################ - - -def get_url_page_number(url: str) -> int: - """Parse the page number from the url. 
- - Note: - If the url does not contain a page number the function returns 1 - - Args: - url: url to get the page number from - - Returns: - The page number that the url contains - """ - parsed_url = urlparse(url) - try: - # if page is not a url query param then this is page 1 - page_number = int(parse_qs(parsed_url.query)['page'][0]) - - except KeyError: - return 1 - - return page_number - - -def retrieve_dict_from_endpoint(logger, key_auth, url, timeout_wait=10) -> Tuple[Optional[dict], GithubApiResult]: - timeout = timeout_wait - timeout_count = 0 - num_attempts = 1 - - while num_attempts <= 10: - - response = hit_api(key_auth, url, logger, timeout) - - if response is None: - if timeout_count == 10: - logger.error(f"Request timed out 10 times for {url}") - return None, GithubApiResult.TIMEOUT - - timeout = timeout * 1.1 - num_attempts += 1 - continue - - - page_data = parse_json_response(logger, response) - - if isinstance(page_data, str): - # TODO: Define process_str_response as outside the class and fix this reference - str_processing_result: Union[str, List[dict]] = process_str_response(logger,page_data) - - if isinstance(str_processing_result, dict): - #return str_processing_result, response, GithubApiResult.SUCCESS - page_data = str_processing_result - else: - num_attempts += 1 - continue - - # if the data is a list, then return it and the response - if isinstance(page_data, list): - logger.warning("Wrong type returned, trying again...") - logger.info(f"Returned list: {page_data}") - - # if the data is a dict then call process_dict_response, and - elif isinstance(page_data, dict): - dict_processing_result = process_dict_response(logger, response, page_data) - - if dict_processing_result == GithubApiResult.SUCCESS: - return page_data, dict_processing_result - if dict_processing_result == GithubApiResult.NEW_RESULT: - logger.info(f"Encountered new dict response from api on url: {url}. 
Response: {page_data}") - return None, GithubApiResult.NEW_RESULT - - if dict_processing_result == GithubApiResult.REPO_NOT_FOUND: - return None, GithubApiResult.REPO_NOT_FOUND - - if dict_processing_result in (GithubApiResult.SECONDARY_RATE_LIMIT, GithubApiResult.ABUSE_MECHANISM_TRIGGERED): - continue - - if dict_processing_result == GithubApiResult.RATE_LIMIT_EXCEEDED: - num_attempts = 0 - continue - - - - num_attempts += 1 - - logger.error("Unable to collect data in 10 attempts") - return None, GithubApiResult.NO_MORE_ATTEMPTS diff --git a/augur/tasks/github/util/github_random_key_auth.py b/augur/tasks/github/util/github_random_key_auth.py index ed539430d8..1c7fc74e84 100644 --- a/augur/tasks/github/util/github_random_key_auth.py +++ b/augur/tasks/github/util/github_random_key_auth.py @@ -9,12 +9,12 @@ class GithubRandomKeyAuth(RandomKeyAuth): github collections can have a class randomly selects an api key for each request """ - def __init__(self, session: Session, logger): + def __init__(self, logger): """Creates a GithubRandomKeyAuth object and initializes the RandomKeyAuth parent class""" # gets the github api keys from the database via the GithubApiKeyHandler - github_api_keys = GithubApiKeyHandler(session, logger).keys + github_api_keys = GithubApiKeyHandler(logger).keys #github_api_keys = random.sample(github_api_keys, len(github_api_keys)) if not github_api_keys: @@ -24,4 +24,23 @@ def __init__(self, session: Session, logger): header_name = "Authorization" key_format = "token {0}" - super().__init__(github_api_keys, header_name, logger, key_format) \ No newline at end of file + super().__init__(github_api_keys, header_name, logger, key_format) + + # It needs to be this at some point, however not all the method calls are sending 3 arguments + + # def __init__(self, session: Session, logger): + # """Creates a GithubRandomKeyAuth object and initializes the RandomKeyAuth parent class""" + + + # # gets the github api keys from the database via the GithubApiKeyHandler + # github_api_keys = GithubApiKeyHandler(session, logger).keys + # #github_api_keys = random.sample(github_api_keys, len(github_api_keys)) + + # if not github_api_keys: + # print("Failed to find github api keys. 
This is usually because your key has expired") + + # # defines the structure of the github api key + # header_name = "Authorization" + # key_format = "token {0}" + + # super().__init__(github_api_keys, header_name, logger, key_format) \ No newline at end of file diff --git a/augur/tasks/github/util/github_task_session.py b/augur/tasks/github/util/github_task_session.py index 0acbbf64cd..2869643bd0 100644 --- a/augur/tasks/github/util/github_task_session.py +++ b/augur/tasks/github/util/github_task_session.py @@ -11,7 +11,9 @@ def __init__(self, logger): engine = get_engine() self.augur_db = DatabaseSession(logger, engine) - self.key_auth = GithubRandomKeyAuth(self.augur_db.session, logger) + #self.key_auth = GithubRandomKeyAuth(self.augur_db.session, logger) + #totalHack + self.key_auth = GithubRandomKeyAuth(logger) self.logger = logger self.platform_id = 1 @@ -35,10 +37,28 @@ class GithubTaskSession(DatabaseSession): platform_id (int): The id that refers to the Github platform """ + #def __init__(self, logger: Logger, engine=None): def __init__(self, logger: Logger, engine=None): super().__init__(logger, engine=engine) - - self.oauths = GithubRandomKeyAuth(self, logger) - self.platform_id = 1 + self.oauths = GithubRandomKeyAuth(logger) + #self.oauths = GithubRandomKeyAuth(self, logger) # Removed and replaced for the issue below in frontend.py + ''' + Debugging this issue: + Traceback (most recent call last): + File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/celery/app/trace.py", line 451, in trace_task + R = retval = fun(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/celery/app/trace.py", line 734, in __protected_call__ + return self.run(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/github/augur/augur/tasks/frontend.py", line 24, in add_org_repo_list + with GithubTaskSession(logger) as session: + ^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/github/augur/augur/tasks/github/util/github_task_session.py", line 44, in __init__ + self.oauths = GithubRandomKeyAuth(self, logger) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + TypeError: GithubRandomKeyAuth.__init__() takes 2 positional arguments but 3 were given + ''' + self.platform_id = 1 \ No newline at end of file diff --git a/augur/tasks/github/util/populate_repo_src_id.py b/augur/tasks/github/util/populate_repo_src_id.py new file mode 100644 index 0000000000..4346bcc3dd --- /dev/null +++ b/augur/tasks/github/util/populate_repo_src_id.py @@ -0,0 +1,29 @@ +import logging +import sqlalchemy as s + +from augur.tasks.init.celery_app import celery_app as celery +from augur.application.db.lib import get_repo_by_repo_git, execute_sql +from augur.tasks.github.util.util import get_owner_repo, get_repo_src_id + +@celery.task +def populate_repo_src_id_task(repo_git): + + logger = logging.getLogger(populate_repo_src_id_task.__name__) + + repo_obj = get_repo_by_repo_git(repo_git) + repo_id = repo_obj.repo_id + + owner, repo = get_owner_repo(repo_git) + + repo_src_id = get_repo_src_id(owner, repo, logger) + + update_repo_src_id(repo_id, repo_src_id) + + return 0 + +def update_repo_src_id(repo_id, repo_src_id): + + query = s.sql.text("""UPDATE repo SET repo_src_id=:repo_src_id WHERE repo_id=:repo_id; + """).bindparams(repo_src_id=repo_src_id, repo_id=repo_id) + + execute_sql(query) diff --git a/augur/tasks/github/util/util.py b/augur/tasks/github/util/util.py index 5dfe100977..76e5419924 100644 --- a/augur/tasks/github/util/util.py +++ 
b/augur/tasks/github/util/util.py @@ -1,13 +1,40 @@ """Utility functions that are useful for several Github tasks""" from typing import Any, List, Tuple import logging +import urllib.parse import json import httpx -from augur.tasks.github.util.github_task_session import GithubTaskManifest -from augur.application.db.session import DatabaseSession -from augur.application.db.models import Repo +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess +from augur.application.db.lib import get_repo_by_repo_git from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps +def get_repo_src_id(owner, repo, logger): + + + query = """query($repo: String!, $owner: String!) { + repository(name: $repo, owner: $owner) { + databaseId + } + } + """ + + key_auth = GithubRandomKeyAuth(logger) + + github_graphql_data_access = GithubGraphQlDataAccess(key_auth, logger) + + variables = { + "owner": owner, + "repo": repo + } + + result_keys = ["repository", "databaseId"] + + repo_src_id = github_graphql_data_access.get_resource(query, variables, result_keys) + + return repo_src_id + + # This function adds a key value pair to a list of dicts and returns the modified list of dicts back def add_key_value_pair_to_dicts(data: List[dict], key: str, value: Any) -> List[dict]: @@ -47,6 +74,10 @@ def get_owner_repo(git_url: str) -> Tuple[str, str]: return owner, repo +def get_gitlab_repo_identifier(owner, repo): + + return urllib.parse.quote(f"{owner}/{repo}", safe='') + def parse_json_response(logger: logging.Logger, response: httpx.Response) -> dict: # try to get json from response @@ -72,29 +103,27 @@ def get_repo_weight_by_issue(logger,repo_git): owner,name = get_owner_repo(repo_git) - with GithubTaskManifest(logger) as manifest: - repo_graphql = GitHubRepoGraphql(logger, manifest.key_auth, owner, name) - number_of_issues_and_prs = len(repo_graphql.get_issues_collection()) + len(repo_graphql.get_pull_requests_collection()) + key_auth = GithubRandomKeyAuth(logger) + + repo_graphql = GitHubRepoGraphql(logger, key_auth, owner, name) + number_of_issues_and_prs = len(repo_graphql.get_issues_collection()) + len(repo_graphql.get_pull_requests_collection()) return number_of_issues_and_prs #Get the weight for each repo for the core collection hook def get_repo_weight_core(logger,repo_git): - from augur.application.db import get_engine - engine = get_engine() - - with DatabaseSession(logger,engine) as session: - repo = Repo.get_by_repo_git(session, repo_git) - if not repo: - raise Exception(f"Task with repo_git of {repo_git} but could not be found in Repo table") - - #try to get the collection status if it exists at this point - try: - status = repo.collection_status[0] - time_factor = calculate_date_weight_from_timestamps(repo.repo_added,status.core_data_last_collected) - except IndexError: - time_factor = calculate_date_weight_from_timestamps(repo.repo_added,None) + repo = get_repo_by_repo_git(repo_git) + + if not repo: + raise Exception(f"Task with repo_git of {repo_git} but could not be found in Repo table") + + #try to get the collection status if it exists at this point + try: + status = repo.collection_status[0] + time_factor = calculate_date_weight_from_timestamps(repo.repo_added,status.core_data_last_collected) + except IndexError: + time_factor = calculate_date_weight_from_timestamps(repo.repo_added,None) #Don't go below zero. 
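The util.py changes above also add get_gitlab_repo_identifier, which the GitLab task files below use in place of the hand-built "{owner}%2f{repo}" URLs. A minimal sketch of the expected behaviour, with made-up owner/repo values (only the urllib.parse.quote(..., safe='') call is taken from the diff):

import urllib.parse

def get_gitlab_repo_identifier(owner, repo):
    # Percent-encode the whole "owner/repo" path, including the slash,
    # so it can be used as the single project-identifier segment in GitLab API URLs.
    return urllib.parse.quote(f"{owner}/{repo}", safe='')

identifier = get_gitlab_repo_identifier("example-group", "example-project")
print(identifier)  # example-group%2Fexample-project
print(f"https://gitlab.com/api/v4/projects/{identifier}/merge_requests")

Encoding the joined path once keeps subgroup or otherwise unusual names safe, instead of relying on a single hard-coded %2f between owner and repo.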
diff --git a/augur/tasks/gitlab/events_task.py b/augur/tasks/gitlab/events_task.py index a7b886da2d..5c85bc5d84 100644 --- a/augur/tasks/gitlab/events_task.py +++ b/augur/tasks/gitlab/events_task.py @@ -6,11 +6,12 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler -from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest from augur.application.db.data_parse import extract_gitlab_mr_event_data, extract_gitlab_issue_event_data -from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models import Repo, Issue, IssueEvent, PullRequest, PullRequestEvent -from augur.application.db.util import execute_session_query +from augur.tasks.github.util.util import get_gitlab_repo_identifier +from augur.application.db.models import Issue, IssueEvent, PullRequest, PullRequestEvent, Repo +from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session +from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth + platform_id = 2 @@ -23,22 +24,21 @@ def collect_gitlab_issue_events(repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) logger = logging.getLogger(collect_gitlab_issue_events.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id + + key_auth = GitlabRandomKeyAuth(logger) - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + events = retrieve_all_gitlab_event_data("issue", repo_git, logger, key_auth) - events = retrieve_all_gitlab_event_data("issue", repo_git, logger, manifest.key_auth) + with get_session() as session: if events: logger.info(f"Length of gitlab issue events: {len(events)}") - process_issue_events(events, f"{owner}/{repo}: Gitlab Issue Events task", repo_id, logger, augur_db) + process_issue_events(events, f"{owner}/{repo}: Gitlab Issue Events task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab issue events") @@ -52,23 +52,21 @@ def collect_gitlab_merge_request_events(repo_git) -> int: repo_git: the repo url string """ - - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) logger = logging.getLogger(collect_gitlab_issue_events.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + key_auth = GitlabRandomKeyAuth(logger) - events = retrieve_all_gitlab_event_data("merge_request", repo_git, logger, manifest.key_auth) + events = retrieve_all_gitlab_event_data("merge_request", repo_git, logger, key_auth) + + with get_session() as session: if events: logger.info(f"Length of gitlab merge request events: {len(events)}") - process_mr_events(events, f"{owner}/{repo}: Gitlab MR Events task", repo_id, logger, augur_db) + process_mr_events(events, f"{owner}/{repo}: Gitlab MR Events task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab merge request events") @@ -84,11 +82,13 @@ def retrieve_all_gitlab_event_data(gtype, repo_git, logger, key_auth) -> None: key_auth: key auth 
cache and rotator object """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) logger.info(f"Collecting gitlab issue events for {owner}/{repo}") - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/events?target_type={gtype}" + url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/events?target_type={gtype}" events = GitlabApiHandler(key_auth, logger) all_data = [] @@ -110,7 +110,7 @@ def retrieve_all_gitlab_event_data(gtype, repo_git, logger, key_auth) -> None: return all_data -def process_issue_events(events, task_name, repo_id, logger, augur_db): +def process_issue_events(events, task_name, repo_id, logger, session): """ Retrieve only the needed data for mr label data from the api response @@ -119,7 +119,7 @@ def process_issue_events(events, task_name, repo_id, logger, augur_db): task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ tool_source = "Gitlab issue events task" @@ -130,7 +130,7 @@ def process_issue_events(events, task_name, repo_id, logger, augur_db): # create mapping from issue number to issue id of current issues issue_url_to_id_map = {} - issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() + issues = session.query(Issue).filter(Issue.repo_id == repo_id).all() for issue in issues: issue_url_to_id_map[issue.gh_issue_number] = issue.issue_id @@ -153,10 +153,10 @@ def process_issue_events(events, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(issue_event_dicts)} gitlab issue events") issue_event_natural_keys = ["issue_id", "issue_event_src_id"] - augur_db.insert_data(issue_event_dicts, IssueEvent, issue_event_natural_keys) + bulk_insert_dicts(logger, issue_event_dicts, IssueEvent, issue_event_natural_keys) -def process_mr_events(events, task_name, repo_id, logger, augur_db): +def process_mr_events(events, task_name, repo_id, logger, session): """ Retrieve only the needed data for mr events from the api response @@ -180,7 +180,7 @@ def process_mr_events(events, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs mr_number_to_id_map = {} - mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + mrs = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for mr in mrs: mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id @@ -203,6 +203,6 @@ def process_mr_events(events, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(mr_event_dicts)} gitlab mr events") mr_event_natural_keys = ["platform_id", "node_id"] - augur_db.insert_data(mr_event_dicts, PullRequestEvent, mr_event_natural_keys) + bulk_insert_dicts(logger, mr_event_dicts, PullRequestEvent, mr_event_natural_keys) diff --git a/augur/tasks/gitlab/gitlab_api_key_handler.py b/augur/tasks/gitlab/gitlab_api_key_handler.py index c3a76f6ddc..72b0ace148 100644 --- a/augur/tasks/gitlab/gitlab_api_key_handler.py +++ b/augur/tasks/gitlab/gitlab_api_key_handler.py @@ -8,11 +8,9 @@ import random from typing import List -from sqlalchemy.orm import Session from augur.tasks.util.redis_list import RedisList -from augur.application.db.lib import get_value -from sqlalchemy import func +from augur.application.db.lib import get_value, get_worker_oauth_keys class NoValidKeysError(Exception): @@ -23,7 
+21,6 @@ class GitlabApiKeyHandler(): """Handles Gitlab API key retrieval from the database and redis Attributes: - session (DatabaseSession): Database connection logger (logging.Logger): Handles all logs oauth_redis_key (str): The key where the gitlab api keys are cached in redis redis_key_list (RedisList): Acts like a python list, and interacts directly with the redis cache @@ -31,9 +28,8 @@ class GitlabApiKeyHandler(): key: (List[str]): List of keys retrieve from database or cache """ - def __init__(self, session: Session, logger): + def __init__(self, logger): - self.session = session self.logger = logger self.oauth_redis_key = "gitlab_oauth_keys_list" @@ -72,15 +68,11 @@ def get_api_keys_from_database(self) -> List[str]: Returns: Github api keys that are in the database """ - from augur.application.db.models import WorkerOauth + keys = get_worker_oauth_keys('gitlab') - select = WorkerOauth.access_token - # randomizing the order at db time - #select.order_by(func.random()) - where = [WorkerOauth.access_token != self.config_key, WorkerOauth.platform == 'gitlab'] + filtered_keys = [item for item in keys if item != self.config_key] - return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).order_by(func.random()).all()] - #return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).all()] + return filtered_keys def get_api_keys(self) -> List[str]: @@ -118,6 +110,8 @@ def get_api_keys(self) -> List[str]: if len(keys) == 0: return [] + keys = [key.strip() for key in keys] + valid_keys = [] with httpx.Client() as client: @@ -136,8 +130,9 @@ def get_api_keys(self) -> List[str]: # add all the keys to redis self.redis_key_list.extend(valid_keys) - if not valid_keys: - raise NoValidKeysError("No valid gitlab api keys found in the config or worker oauth table") + # Removed because most people do not collect gitlab and this blows up on startup if they don't have any gitlab keys + # if not valid_keys: + # raise NoValidKeysError("No valid gitlab api keys found in the config or worker oauth table") # shuffling the keys so not all processes get the same keys in the same order diff --git a/augur/tasks/gitlab/gitlab_random_key_auth.py b/augur/tasks/gitlab/gitlab_random_key_auth.py index b2afded3ae..3269d1ec3e 100644 --- a/augur/tasks/gitlab/gitlab_random_key_auth.py +++ b/augur/tasks/gitlab/gitlab_random_key_auth.py @@ -1,7 +1,4 @@ """Defines the GitlabRandomKeyAuth class""" - -from sqlalchemy.orm import Session - from augur.tasks.util.random_key_auth import RandomKeyAuth from augur.tasks.gitlab.gitlab_api_key_handler import GitlabApiKeyHandler @@ -11,12 +8,12 @@ class GitlabRandomKeyAuth(RandomKeyAuth): gitlab collections can have a class randomly selects an api key for each request """ - def __init__(self, session: Session, logger): + def __init__(self, logger): """Creates a GitlabRandomKeyAuth object and initializes the RandomKeyAuth parent class""" # gets the gitlab api keys from the database via the GitlabApiKeyHandler - gitlab_api_keys = GitlabApiKeyHandler(session, logger).keys + gitlab_api_keys = GitlabApiKeyHandler(logger).keys if not gitlab_api_keys: print("Failed to find github api keys. 
This is usually because your key has expired") @@ -24,4 +21,4 @@ def __init__(self, session: Session, logger): header_name = "Authorization" key_format = "Bearer {0}" - super().__init__(gitlab_api_keys, header_name, session.logger, key_format) \ No newline at end of file + super().__init__(gitlab_api_keys, header_name, logger, key_format) \ No newline at end of file diff --git a/augur/tasks/gitlab/gitlab_task_session.py b/augur/tasks/gitlab/gitlab_task_session.py index 0892087d22..3f65f89f42 100644 --- a/augur/tasks/gitlab/gitlab_task_session.py +++ b/augur/tasks/gitlab/gitlab_task_session.py @@ -7,35 +7,6 @@ from augur.application.db.session import DatabaseSession from augur.application.db import get_engine -class GitlabTaskManifest: - """ - Manifest object that represents the state and common elements of - the specified task. GitLab version for the GitLab tasks. - - Attributes: - augur_db: sqlalchemy db object - key_auth: GitLab specific key auth retrieval collection - logger: logging object - platform_id: GitLab specific platform id (github is 1) - """ - - def __init__(self, logger): - - engine = get_engine() - - self.augur_db = DatabaseSession(logger, engine) - self.key_auth = GitlabRandomKeyAuth(self.augur_db.session, logger) - self.logger = logger - self.platform_id = 2 - - def __enter__(self): - - return self - - def __exit__(self, exception_type, exception_value, exception_traceback): - - self.augur_db.close() - class GitlabTaskSession(DatabaseSession): """ORM session used in gitlab tasks. This class adds the platform_id and the gitlab key authentication class, @@ -51,6 +22,6 @@ def __init__(self, logger: Logger, engine=None): super().__init__(logger, engine=engine) - self.oauths = GitlabRandomKeyAuth(self, logger) + self.oauths = GitlabRandomKeyAuth(logger) self.platform_id = 2 diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index 6159a6bb0a..8a1415a7de 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -7,12 +7,12 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler -from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest from augur.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue, extract_needed_gitlab_issue_label_data, extract_needed_gitlab_issue_assignee_data, extract_needed_gitlab_issue_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data -from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts -from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Repo, Contributor -from augur.application.db.util import execute_session_query +from augur.tasks.github.util.util import get_gitlab_repo_identifier, add_key_value_pair_to_dicts +from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Contributor, Repo from augur.tasks.util.worker_util import remove_duplicate_dicts +from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session, batch_insert_contributors +from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth platform_id = 2 @@ -26,28 +26,24 @@ def collect_gitlab_issues(repo_git : str) -> int: """ logger = logging.getLogger(collect_gitlab_issues.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = 
manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id - try: - - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + key_auth = GitlabRandomKeyAuth(logger) - owner, repo = get_owner_repo(repo_git) - - issue_data = retrieve_all_gitlab_issue_data(repo_git, logger, manifest.key_auth) + try: + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + issue_data = retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) - if issue_data: - issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + if issue_data: + issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger) - return issue_ids - else: - logger.info(f"{owner}/{repo} has no issues") - return [] - except Exception as e: + return issue_ids + else: + logger.info(f"{owner}/{repo} has no issues") + return [] + except Exception as e: logger.error(f"Could not collect gitlab issues for repo {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") return -1 @@ -61,11 +57,13 @@ def retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) -> None: key_auth: key auth cache and rotator object """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) logger.info(f"Collecting gitlab issues for {owner}/{repo}") - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues?with_labels_details=True" + url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/issues?with_labels_details=True" issues = GitlabApiHandler(key_auth, logger) all_data = [] @@ -87,7 +85,7 @@ def retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) -> None: return all_data -def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: +def process_issues(issues, task_name, repo_id, logger) -> None: """ Retrieve only the needed data for issues from the api response @@ -96,7 +94,7 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ # get repo_id or have it passed @@ -142,14 +140,14 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: # insert contributors from these issues logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + batch_insert_contributors(logger, contributors) logger.info(f"{task_name}: Inserting {len(issue_dicts)} gitlab issues") issue_natural_keys = ["repo_id", "gh_issue_id"] issue_string_columns = ["issue_title", "issue_body"] issue_return_columns = ["gh_issue_id", "issue_id"] - issue_return_data = augur_db.insert_data(issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) + issue_return_data = bulk_insert_dicts(logger, issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) issue_label_dicts = [] issue_assignee_dicts = [] @@ -176,12 +174,12 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: # we are using label_src_id and issue_id to determine if the label is already in the database. 
issue_label_natural_keys = ['label_src_id', 'issue_id'] issue_label_string_fields = ["label_text", "label_description"] - augur_db.insert_data(issue_label_dicts, IssueLabel, + bulk_insert_dicts(logger, issue_label_dicts, IssueLabel, issue_label_natural_keys, string_fields=issue_label_string_fields) # inserting issue assignees issue_assignee_natural_keys = ['issue_assignee_src_id', 'issue_id'] - augur_db.insert_data(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) + bulk_insert_dicts(logger, issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) return issue_ids @@ -211,22 +209,21 @@ def collect_gitlab_issue_comments(issue_ids, repo_git) -> int: repo_git: repo url """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) logger = logging.getLogger(collect_gitlab_issues.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + key_auth = GitlabRandomKeyAuth(logger) - comments = retrieve_all_gitlab_issue_comments(manifest.key_auth, logger, issue_ids, repo_git) + comments = retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git) + + with get_session() as session: if comments: logger.info(f"Length of comments: {len(comments)}") - process_gitlab_issue_messages(comments, f"{owner}/{repo}: Gitlab issue messages task", repo_id, logger, augur_db) + process_gitlab_issue_messages(comments, f"{owner}/{repo}: Gitlab issue messages task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab issue comments") @@ -242,7 +239,9 @@ def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git): repo_git: repo url """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) all_comments = {} issue_count = len(issue_ids) @@ -254,7 +253,7 @@ def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git): logger.info(f"Collecting {owner}/{repo} gitlab issue comments for issue {index} of {issue_count}") - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues/{id}/notes" + url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/issues/{id}/notes" for page_data, _ in comments.iter_pages(url): @@ -271,7 +270,7 @@ def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git): return all_comments -def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): +def process_gitlab_issue_messages(data, task_name, repo_id, logger, session): """ Retrieve only the needed data for issue messages from the api response @@ -280,7 +279,7 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ tool_source = "Gitlab issue comments" @@ -289,7 +288,7 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs issue_number_to_id_map = {} - issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() + issues = session.query(Issue).filter(Issue.repo_id == repo_id).all() for issue in issues: 
issue_number_to_id_map[issue.gh_issue_number] = issue.issue_id @@ -308,7 +307,7 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): for message in messages: - message, contributor = process_gitlab_comment_contributors(message, tool_source, tool_version, data_source) + message, contributor = process_gitlab_issue_comment_contributors(message, tool_source, tool_version, data_source) if contributor: contributors.append(contributor) @@ -320,19 +319,19 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): } message_dicts.append( - extract_needed_gitlab_message_data(message, platform_id, tool_source, tool_version, data_source) + extract_needed_gitlab_message_data(message, platform_id, repo_id, tool_source, tool_version, data_source) ) contributors = remove_duplicate_dicts(contributors) logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + batch_insert_contributors(logger, contributors) logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") - message_natural_keys = ["platform_msg_id"] + message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] - message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, + message_return_data = bulk_insert_dicts(logger, message_dicts, Message, message_natural_keys, return_columns=message_return_columns, string_fields=message_string_fields) issue_message_ref_dicts = [] @@ -349,10 +348,10 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(issue_message_ref_dicts)} gitlab issue messages ref rows") issue_message_ref_natural_keys = ["issue_id", "issue_msg_ref_src_comment_id"] - augur_db.insert_data(issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) + bulk_insert_dicts(logger, issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) -def process_gitlab_comment_contributors(message, tool_source, tool_version, data_source): +def process_gitlab_issue_comment_contributors(message, tool_source, tool_version, data_source): contributor = extract_needed_gitlab_contributor_data(message["author"], tool_source, tool_version, data_source) if contributor: diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index 29ee7a54bb..7a3b006184 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -3,11 +3,12 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler -from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest -from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_mr_reviewer_data, extract_needed_mr_commit_data, extract_needed_mr_file_data, extract_needed_mr_metadata, extract_needed_gitlab_mr_message_ref_data, extract_needed_gitlab_message_data -from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts -from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message -from augur.application.db.util import 
execute_session_query +from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_mr_reviewer_data, extract_needed_mr_commit_data, extract_needed_mr_file_data, extract_needed_mr_metadata, extract_needed_gitlab_mr_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data +from augur.tasks.github.util.util import get_gitlab_repo_identifier, add_key_value_pair_to_dicts +from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message, Contributor, PullRequestAssignee +from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth +from augur.tasks.util.worker_util import remove_duplicate_dicts +from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session, batch_insert_contributors platform_id = 2 @@ -23,23 +24,21 @@ def collect_gitlab_merge_requests(repo_git: str) -> int: logger = logging.getLogger(collect_gitlab_merge_requests.__name__) - with GitlabTaskManifest(logger) as manifest: + repo_id = get_repo_by_repo_git(repo_git).repo_id - augur_db = manifest.augur_db + owner, repo = Repo.parse_gitlab_repo_url(repo_git) - repo_id = augur_db.session.query(Repo).filter( - Repo.repo_git == repo_git).one().repo_id + key_auth = GitlabRandomKeyAuth(logger) - owner, repo = get_owner_repo(repo_git) - mr_data = retrieve_all_mr_data(repo_git, logger, manifest.key_auth) + mr_data = retrieve_all_mr_data(repo_git, logger, key_auth) - if mr_data: - mr_ids = process_merge_requests(mr_data, f"{owner}/{repo}: Mr task", repo_id, logger, augur_db) + if mr_data: + mr_ids = process_merge_requests(mr_data, f"{owner}/{repo}: Mr task", repo_id, logger) - return mr_ids - else: - logger.info(f"{owner}/{repo} has no merge requests") - return [] + return mr_ids + else: + logger.info(f"{owner}/{repo} has no merge requests") + return [] def retrieve_all_mr_data(repo_git: str, logger, key_auth) -> None: @@ -52,11 +51,13 @@ def retrieve_all_mr_data(repo_git: str, logger, key_auth) -> None: key_auth: key auth cache and rotator object """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) logger.info(f"Collecting pull requests for {owner}/{repo}") - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests?with_labels_details=True" + url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests?with_labels_details=True" mrs = GitlabApiHandler(key_auth, logger) all_data = [] @@ -79,7 +80,7 @@ def retrieve_all_mr_data(repo_git: str, logger, key_auth) -> None: return all_data -def process_merge_requests(data, task_name, repo_id, logger, augur_db): +def process_merge_requests(data, task_name, repo_id, logger): """ Retrieve only the needed data for mr label data from the api response @@ -88,7 +89,6 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object Returns: List of parsed MR ids. 
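The same refactor repeats throughout these GitLab task hunks: the GitlabTaskManifest context manager is dropped, the repo id comes from get_repo_by_repo_git, API keys come from GitlabRandomKeyAuth(logger), and writes go through the module-level bulk_insert_dicts / batch_insert_contributors helpers rather than augur_db.insert_data. A condensed, illustrative sketch of that task shape (collect_example and the sample rows are invented; the helper names and signatures mirror the diff):

import logging

from augur.application.db.lib import (batch_insert_contributors, bulk_insert_dicts,
                                      get_repo_by_repo_git)
from augur.application.db.models import PullRequestLabel, Repo
from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth

def collect_example(repo_git):
    logger = logging.getLogger(collect_example.__name__)

    owner, repo = Repo.parse_gitlab_repo_url(repo_git)   # replaces get_owner_repo()
    repo_id = get_repo_by_repo_git(repo_git).repo_id     # replaces the session query on Repo
    key_auth = GitlabRandomKeyAuth(logger)                # no longer needs a db session
    # key_auth would be handed to GitlabApiHandler / the retrieve_* helpers, as in the hunks below

    # Contributors are de-duplicated and batch-inserted once per task run.
    contributors = [{"cntrb_id": "00000000-0000-0000-0000-000000000000"}]  # illustrative row
    batch_insert_contributors(logger, contributors)

    # Other rows go through bulk_insert_dicts(logger, dicts, Model, natural_keys, ...).
    mr_label_dicts = [{"pr_src_id": 1, "pull_request_id": 1,               # illustrative row
                       "pr_src_description": "example label"}]
    bulk_insert_dicts(logger, mr_label_dicts, PullRequestLabel,
                      ["pr_src_id", "pull_request_id"],
                      string_fields=["pr_src_description"])

The design intent, as far as the diff shows, is that tasks no longer own a long-lived session just to look up ids or rotate keys; a short-lived get_session() block is kept only where ORM reads are still needed.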
@@ -99,12 +99,17 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): data_source = "Gitlab API" merge_requests = [] + contributors = [] mr_ids = [] mr_mapping_data = {} for mr in data: mr_ids.append(mr["iid"]) + mr, contributor_data = process_mr_contributors(mr, tool_source, tool_version, data_source) + + contributors += contributor_data + merge_requests.append(extract_needed_pr_data_from_gitlab_merge_request(mr, repo_id, tool_source, tool_version)) assignees = extract_needed_merge_request_assignee_data(mr["assignees"], repo_id, tool_source, tool_version, data_source) @@ -117,11 +122,16 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): "labels": labels } + contributors = remove_duplicate_dicts(contributors) + + logger.info(f"{task_name}: Inserting {len(contributors)} contributors") + batch_insert_contributors(logger, contributors) + logger.info(f"{task_name}: Inserting mrs of length: {len(merge_requests)}") pr_natural_keys = ["repo_id", "pr_src_id"] pr_string_fields = ["pr_src_title", "pr_body"] pr_return_columns = ["pull_request_id", "pr_src_id"] - pr_return_data = augur_db.insert_data(merge_requests, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) + pr_return_data = bulk_insert_dicts(logger, merge_requests, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) mr_assignee_dicts = [] @@ -142,13 +152,12 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting other pr data of lengths: Labels: {len(mr_label_dicts)} - Assignees: {len(mr_assignee_dicts)}") - # TODO: Setup unique key on asignees with a value of ('cntrb_id', 'pull_request_id') and add 'cntrb_id' to assingee data - # mr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] - # augur_db.insert_data(mr_assignee_dicts, PullRequestAssignee, mr_assignee_natural_keys) + mr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] + bulk_insert_dicts(logger, mr_assignee_dicts, PullRequestAssignee, mr_assignee_natural_keys) pr_label_natural_keys = ['pr_src_id', 'pull_request_id'] pr_label_string_fields = ["pr_src_description"] - augur_db.insert_data(mr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) + bulk_insert_dicts(logger, mr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) return mr_ids @@ -164,28 +173,29 @@ def collect_merge_request_comments(mr_ids, repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) logger = logging.getLogger(collect_merge_request_comments.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_identifier = get_gitlab_repo_identifier(owner, repo) + + repo_id = get_repo_by_repo_git(repo_git).repo_id + + key_auth = GitlabRandomKeyAuth(logger) - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/notes".format(repo_identifier=repo_identifier, id="{id}") + comments = retrieve_merge_request_data(mr_ids, url, "comments", owner, repo, key_auth, logger, response_type="list") - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/notes".format(owner=owner, repo=repo, id="{id}") - comments = 
retrieve_merge_request_data(mr_ids, url, "comments", owner, repo, manifest.key_auth, logger, response_type="list") + with get_session() as session: if comments: logger.info(f"Length of merge request comments: {len(comments)}") - process_gitlab_mr_messages(comments, f"{owner}/{repo}: Gitlab mr messages task", repo_id, logger, augur_db) + process_gitlab_mr_messages(comments, f"{owner}/{repo}: Gitlab mr messages task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab merge request comments") -def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): +def process_gitlab_mr_messages(data, task_name, repo_id, logger, session): """ Retrieve only the needed data for mr label data from the api response @@ -194,7 +204,7 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ tool_source = "Gitlab mr comments" @@ -203,11 +213,12 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs mr_number_to_id_map = {} - mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + mrs = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for mr in mrs: mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id message_dicts = [] + contributors = [] message_ref_mapping_data = {} for id, messages in data.items(): @@ -221,6 +232,11 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): for message in messages: + message, contributor = process_gitlab_mr_comment_contributors(message, tool_source, tool_version, data_source) + + if contributor: + contributors.append(contributor) + mr_message_ref_data = extract_needed_gitlab_mr_message_ref_data(message, pull_request_id, repo_id, tool_source, tool_version, data_source) message_ref_mapping_data[message["id"]] = { @@ -228,15 +244,19 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): } message_dicts.append( - extract_needed_gitlab_message_data(message, platform_id, tool_source, tool_version, data_source) + extract_needed_gitlab_message_data(message, platform_id, repo_id, tool_source, tool_version, data_source) ) + contributors = remove_duplicate_dicts(contributors) - logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") - message_natural_keys = ["platform_msg_id"] + logger.info(f"{task_name}: Inserting {len(contributors)} mr message contributors") + batch_insert_contributors(logger, contributors) + + logger.info(f"{task_name}: Inserting {len(message_dicts)} mr messages") + message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] - message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, + message_return_data = bulk_insert_dicts(logger, message_dicts, Message, message_natural_keys, return_columns=message_return_columns, string_fields=message_string_fields) mr_message_ref_dicts = [] @@ -253,7 +273,7 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(mr_message_ref_dicts)} mr messages ref rows") mr_message_ref_natural_keys = ["pull_request_id", "pr_message_ref_src_comment_id"] - augur_db.insert_data(mr_message_ref_dicts, 
PullRequestMessageRef, mr_message_ref_natural_keys) + bulk_insert_dicts(logger, mr_message_ref_dicts, PullRequestMessageRef, mr_message_ref_natural_keys) @celery.task(base=AugurCoreRepoCollectionTask) @@ -266,27 +286,28 @@ def collect_merge_request_metadata(mr_ids, repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) logger = logging.getLogger(collect_merge_request_metadata.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + key_auth = GitlabRandomKeyAuth(logger) - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}".format(owner=owner, repo=repo, id="{id}") - metadata_list = retrieve_merge_request_data(mr_ids, url, "metadata", owner, repo, manifest.key_auth, logger, response_type="dict") + url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}".format(repo_identifier=repo_identifier, id="{id}") + metadata_list = retrieve_merge_request_data(mr_ids, url, "metadata", owner, repo, key_auth, logger, response_type="dict") + + with get_session() as session: if metadata_list: logger.info(f"Length of merge request metadata: {len(metadata_list)}") - process_mr_metadata(metadata_list, f"{owner}/{repo}: Mr metadata task", repo_id, logger, augur_db) + process_mr_metadata(metadata_list, f"{owner}/{repo}: Mr metadata task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab merge request metadata") -def process_mr_metadata(data, task_name, repo_id, logger, augur_db): +def process_mr_metadata(data, task_name, repo_id, logger, session): """ Retrieve only the needed data for mr label data from the api response @@ -295,7 +316,7 @@ def process_mr_metadata(data, task_name, repo_id, logger, augur_db): task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ tool_source = "Mr Metadata Task" @@ -304,7 +325,7 @@ def process_mr_metadata(data, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs mr_number_to_id_map = {} - mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + mrs = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for mr in mrs: mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id @@ -319,7 +340,7 @@ def process_mr_metadata(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(all_metadata)} merge request metadata") pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha'] - augur_db.insert_data(all_metadata, PullRequestMeta, pr_metadata_natural_keys) + bulk_insert_dicts(logger, all_metadata, PullRequestMeta, pr_metadata_natural_keys) @celery.task(base=AugurCoreRepoCollectionTask) @@ -332,27 +353,28 @@ def collect_merge_request_reviewers(mr_ids, repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) logger = logging.getLogger(collect_merge_request_reviewers.__name__) - with 
GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + key_auth = GitlabRandomKeyAuth(logger) - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/approvals".format(owner=owner, repo=repo, id="{id}") - reviewers = retrieve_merge_request_data(mr_ids, url, "reviewers", owner, repo, manifest.key_auth, logger, response_type="dict") + url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/approvals".format(repo_identifier=repo_identifier, id="{id}") + reviewers = retrieve_merge_request_data(mr_ids, url, "reviewers", owner, repo, key_auth, logger, response_type="dict") + + with get_session() as session: if reviewers: logger.info(f"Length of merge request reviewers: {len(reviewers)}") - process_mr_reviewers(reviewers, f"{owner}/{repo}: Mr reviewer task", repo_id, logger, augur_db) + process_mr_reviewers(reviewers, f"{owner}/{repo}: Mr reviewer task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab merge request reviewers") -def process_mr_reviewers(data, task_name, repo_id, logger, augur_db): +def process_mr_reviewers(data, task_name, repo_id, logger, session): """ Retrieve only the needed data for mr Reviewer data from the api response @@ -360,7 +382,7 @@ def process_mr_reviewers(data, task_name, repo_id, logger, augur_db): data: List of dictionaries of mr Reviewer data repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ tool_source = "Mr Reviewer Task" @@ -371,7 +393,7 @@ def process_mr_reviewers(data, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs mr_number_to_id_map = {} - mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + mrs = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for mr in mrs: mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id @@ -386,7 +408,7 @@ def process_mr_reviewers(data, task_name, repo_id, logger, augur_db): # TODO: Need to add unique key with pull_request_id and cntrb_id to insert gitlab reviewers # pr_reviewer_natural_keys = ["pull_request_id", "cntrb_id"] - # augur_db.insert_data(all_reviewers, PullRequestReviewer, pr_reviewer_natural_keys) + # bulk_insert_dicts(all_reviewers, PullRequestReviewer, pr_reviewer_natural_keys) @@ -400,28 +422,29 @@ def collect_merge_request_commits(mr_ids, repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) logger = logging.getLogger(collect_merge_request_commits.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + key_auth = GitlabRandomKeyAuth(logger) - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/commits".format(owner=owner, repo=repo, id="{id}") - commits = retrieve_merge_request_data(mr_ids, url, "commits", owner, repo, manifest.key_auth, logger, response_type="list") + url = 
"https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/commits".format(repo_identifier=repo_identifier, id="{id}") + commits = retrieve_merge_request_data(mr_ids, url, "commits", owner, repo, key_auth, logger, response_type="list") + + with get_session() as session: if commits: logger.info(f"Length of merge request commits: {len(commits)}") - process_mr_commits(commits, f"{owner}/{repo}: Mr commit task", repo_id, logger, augur_db) + process_mr_commits(commits, f"{owner}/{repo}: Mr commit task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab merge request commits") -def process_mr_commits(data, task_name, repo_id, logger, augur_db): +def process_mr_commits(data, task_name, repo_id, logger, session): """ Retrieve only the needed data for mr commits from the api response @@ -430,7 +453,7 @@ def process_mr_commits(data, task_name, repo_id, logger, augur_db): task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ tool_source = "Mr Commit Task" @@ -439,7 +462,7 @@ def process_mr_commits(data, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs mr_number_to_id_map = {} - mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + mrs = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for mr in mrs: mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id @@ -455,7 +478,7 @@ def process_mr_commits(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(all_commits)} merge request commits") pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"] - augur_db.insert_data(all_commits,PullRequestCommit,pr_commits_natural_keys) + bulk_insert_dicts(logger, all_commits,PullRequestCommit,pr_commits_natural_keys) @@ -469,27 +492,28 @@ def collect_merge_request_files(mr_ids, repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) - logger = logging.getLogger(collect_merge_request_files.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) + + repo_id = get_repo_by_repo_git(repo_git).repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + key_auth = GitlabRandomKeyAuth(logger) - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/changes".format(owner=owner, repo=repo, id="{id}") - files = retrieve_merge_request_data(mr_ids, url, "files", owner, repo, manifest.key_auth, logger, response_type="dict") + url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/changes".format(repo_identifier=repo_identifier, id="{id}") + files = retrieve_merge_request_data(mr_ids, url, "files", owner, repo, key_auth, logger, response_type="dict") + + with get_session() as session: if files: logger.info(f"Length of merge request files: {len(files)}") - process_mr_files(files, f"{owner}/{repo}: Mr files task", repo_id, logger, augur_db) + process_mr_files(files, f"{owner}/{repo}: Mr files task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab merge request files") -def process_mr_files(data, task_name, repo_id, logger, augur_db): +def 
process_mr_files(data, task_name, repo_id, logger, session): tool_source = "Mr files Task" tool_version = "2.0" @@ -497,7 +521,7 @@ def process_mr_files(data, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs mr_number_to_id_map = {} - mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + mrs = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for mr in mrs: mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id @@ -510,7 +534,7 @@ def process_mr_files(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(all_files)} merge request files") pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"] - augur_db.insert_data(all_files, PullRequestFile, pr_file_natural_keys) + bulk_insert_dicts(logger, all_files, PullRequestFile, pr_file_natural_keys) def retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger, response_type): @@ -560,3 +584,30 @@ def retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger, r index += 1 return all_data + + +def process_mr_contributors(mr, tool_source, tool_version, data_source): + + contributors = [] + + issue_cntrb = extract_needed_gitlab_contributor_data(mr["author"], tool_source, tool_version, data_source) + mr["cntrb_id"] = issue_cntrb["cntrb_id"] + contributors.append(issue_cntrb) + + for assignee in mr["assignees"]: + + issue_assignee_cntrb = extract_needed_gitlab_contributor_data(assignee, tool_source, tool_version, data_source) + assignee["cntrb_id"] = issue_assignee_cntrb["cntrb_id"] + contributors.append(issue_assignee_cntrb) + + return mr, contributors + +def process_gitlab_mr_comment_contributors(message, tool_source, tool_version, data_source): + + contributor = extract_needed_gitlab_contributor_data(message["author"], tool_source, tool_version, data_source) + if contributor: + message["cntrb_id"] = contributor["cntrb_id"] + else: + message["cntrb_id"] = None + + return message, contributor \ No newline at end of file diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index a18284186c..04b07bb9a7 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -13,8 +13,8 @@ from augur.application.logs import TaskLogConfig, AugurLogger from augur.application.db.session import DatabaseSession -from augur.application.db.engine import DatabaseEngine from augur.application.db import get_engine +from augur.application.db.lib import get_session from augur.application.config import AugurConfig from augur.tasks.init import get_redis_conn_values, get_rabbitmq_conn_string from augur.application.db.models import Repo @@ -26,18 +26,19 @@ 'augur.tasks.data_analysis', 'augur.tasks.util.collection_util'] -github_tasks = ['augur.tasks.github.contributors.tasks', - 'augur.tasks.github.issues.tasks', +github_tasks = ['augur.tasks.github.contributors', + 'augur.tasks.github.issues', 'augur.tasks.github.pull_requests.tasks', - 'augur.tasks.github.events.tasks', - 'augur.tasks.github.messages.tasks', + 'augur.tasks.github.events', + 'augur.tasks.github.messages', 'augur.tasks.github.facade_github.tasks', 'augur.tasks.github.releases.tasks', 'augur.tasks.github.repo_info.tasks', 'augur.tasks.github.detect_move.tasks', 'augur.tasks.github.pull_requests.files_model.tasks', 'augur.tasks.github.pull_requests.commits_model.tasks', - 'augur.tasks.github.traffic.tasks'] + 'augur.tasks.github.traffic', + 
'augur.tasks.github.util.populate_repo_src_id'] gitlab_tasks = ['augur.tasks.gitlab.merge_request_task', 'augur.tasks.gitlab.issues_task', @@ -83,7 +84,7 @@ def augur_handle_task_failure(self,exc,task_id,repo_git,logger_name,collection_h logger.error(f"Task {task_id} raised exception: {exc}\n Traceback: {''.join(traceback.format_exception(None, exc, exc.__traceback__))}") - with DatabaseSession(logger,engine) as session: + with get_session() as session: logger.info(f"Repo git: {repo_git}") repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() @@ -200,9 +201,10 @@ def setup_periodic_tasks(sender, **kwargs): The tasks so that they are grouped by the module they are defined in """ from celery.schedules import crontab - from augur.tasks.start_tasks import augur_collection_monitor, augur_collection_update_weights - from augur.tasks.start_tasks import non_repo_domain_tasks, retry_errored_repos + from augur.tasks.start_tasks import augur_collection_monitor + from augur.tasks.start_tasks import non_repo_domain_tasks, retry_errored_repos, create_collection_status_records from augur.tasks.git.facade_tasks import clone_repos + from augur.tasks.github.contributors import process_contributors from augur.tasks.db.refresh_materialized_views import refresh_materialized_views from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model from augur.application.db import temporary_database_engine @@ -217,23 +219,25 @@ def setup_periodic_tasks(sender, **kwargs): sender.add_periodic_task(collection_interval, augur_collection_monitor.s()) #Do longer tasks less often - non_domain_collection_interval = collection_interval * 300 - logger.info(f"Scheduling non-repo-domain collection every {non_domain_collection_interval/60} minutes") - sender.add_periodic_task(non_domain_collection_interval, non_repo_domain_tasks.s()) + logger.info(f"Scheduling data analysis every 30 days") + thirty_days_in_seconds = 30*24*60*60 + sender.add_periodic_task(thirty_days_in_seconds, non_repo_domain_tasks.s()) mat_views_interval = int(config.get_value('Celery', 'refresh_materialized_views_interval_in_days')) logger.info(f"Scheduling refresh materialized view every night at 1am CDT") sender.add_periodic_task(datetime.timedelta(days=mat_views_interval), refresh_materialized_views.s()) - logger.info(f"Scheduling update of collection weights on midnight each day") - sender.add_periodic_task(crontab(hour=0, minute=0),augur_collection_update_weights.s()) + # logger.info(f"Scheduling update of collection weights on midnight each day") + # sender.add_periodic_task(crontab(hour=0, minute=0),augur_collection_update_weights.s()) logger.info(f"Setting 404 repos to be marked for retry on midnight each day") sender.add_periodic_task(crontab(hour=0, minute=0),retry_errored_repos.s()) - logger.info(f"Scheduling contributor breadth every 30 days") - thirty_days_in_seconds = 30*24*60*60 - sender.add_periodic_task(thirty_days_in_seconds, contributor_breadth_model.s()) + one_hour_in_seconds = 60*60 + sender.add_periodic_task(one_hour_in_seconds, process_contributors.s()) + + one_day_in_seconds = 24*60*60 + sender.add_periodic_task(one_day_in_seconds, create_collection_status_records.s()) @after_setup_logger.connect def setup_loggers(*args,**kwargs): diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index b4116dd7da..3a61e391a8 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -14,10 +14,11 @@ from augur.tasks.github.repo_info.tasks import 
collect_repo_info, collect_linux_badge_info from augur.tasks.github.pull_requests.files_model.tasks import process_pull_request_files from augur.tasks.github.pull_requests.commits_model.tasks import process_pull_request_commits +from augur.tasks.github.util.populate_repo_src_id import populate_repo_src_id_task from augur.tasks.git.dependency_tasks.tasks import process_ossf_dependency_metrics -from augur.tasks.github.traffic.tasks import collect_github_repo_clones_data -from augur.tasks.gitlab.merge_request_task import collect_gitlab_merge_requests, collect_merge_request_metadata, collect_merge_request_commits, collect_merge_request_files -from augur.tasks.gitlab.issues_task import collect_gitlab_issues +from augur.tasks.github.traffic import collect_github_repo_clones_data +from augur.tasks.gitlab.merge_request_task import collect_gitlab_merge_requests, collect_merge_request_metadata, collect_merge_request_commits, collect_merge_request_files, collect_merge_request_comments +from augur.tasks.gitlab.issues_task import collect_gitlab_issues, collect_gitlab_issue_comments from augur.tasks.gitlab.events_task import collect_gitlab_issue_events, collect_gitlab_merge_request_events from augur.tasks.git.facade_tasks import * from augur.tasks.db.refresh_materialized_views import * @@ -27,6 +28,9 @@ from augur.tasks.util.collection_state import CollectionState from augur.tasks.util.collection_util import * from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_facade_weight_time_factor +from augur.application.db.lib import execute_sql, get_session + +RUNNING_DOCKER = os.environ.get('AUGUR_DOCKER_DEPLOY') == "1" CELERY_GROUP_TYPE = type(group()) CELERY_CHAIN_TYPE = type(chain()) @@ -42,13 +46,13 @@ """ #Prelim phases are used to detect if where the repo has hosted has moved or not. -def prelim_phase(repo_git): +def prelim_phase(repo_git, full_collection): logger = logging.getLogger(prelim_phase.__name__) return detect_github_repo_move_core.si(repo_git) -def prelim_phase_secondary(repo_git): +def prelim_phase_secondary(repo_git, full_collection): logger = logging.getLogger(prelim_phase.__name__) return detect_github_repo_move_secondary.si(repo_git) @@ -56,27 +60,28 @@ def prelim_phase_secondary(repo_git): #This is the phase that defines the message for core augur collection #A chain is needed for each repo. -def primary_repo_collect_phase(repo_git): +def primary_repo_collect_phase(repo_git, full_collection): logger = logging.getLogger(primary_repo_collect_phase.__name__) #Define primary group of jobs for the primary collect phase: issues and pull requests. primary_repo_jobs = group( - collect_issues.si(repo_git), - collect_pull_requests.si(repo_git) + populate_repo_src_id_task.si(repo_git), + collect_issues.si(repo_git, full_collection), + collect_pull_requests.si(repo_git, full_collection) ) #Define secondary group that can't run until after primary jobs have finished. 
secondary_repo_jobs = group( - collect_events.si(repo_git),#*create_grouped_task_load(dataList=first_pass, task=collect_events).tasks, - collect_github_messages.si(repo_git), #*create_grouped_task_load(dataList=first_pass,task=collect_github_messages).tasks, + collect_events.si(repo_git, full_collection),#*create_grouped_task_load(dataList=first_pass, task=collect_events).tasks, + collect_github_messages.si(repo_git, full_collection), #*create_grouped_task_load(dataList=first_pass,task=collect_github_messages).tasks, collect_github_repo_clones_data.si(repo_git), ) #Other tasks that don't need other tasks to run before they do just put in final group. repo_task_group = group( collect_repo_info.si(repo_git), - chain(primary_repo_jobs | issue_pr_task_update_weight_util.s(repo_git=repo_git),secondary_repo_jobs,process_contributors.si()), + chain(primary_repo_jobs | issue_pr_task_update_weight_util.s(repo_git=repo_git),secondary_repo_jobs), #facade_phase(logger,repo_git), collect_linux_badge_info.si(repo_git), collect_releases.si(repo_git), @@ -85,13 +90,13 @@ def primary_repo_collect_phase(repo_git): return repo_task_group -def primary_repo_collect_phase_gitlab(repo_git): +def primary_repo_collect_phase_gitlab(repo_git, full_collection): logger = logging.getLogger(primary_repo_collect_phase_gitlab.__name__) jobs = group( chain(collect_gitlab_merge_requests.si(repo_git), group( - #collect_merge_request_comments.s(repo_git), + collect_merge_request_comments.s(repo_git), #collect_merge_request_reviewers.s(repo_git), collect_merge_request_metadata.s(repo_git), collect_merge_request_commits.s(repo_git), @@ -99,7 +104,7 @@ def primary_repo_collect_phase_gitlab(repo_git): collect_gitlab_merge_request_events.si(repo_git), )), chain(collect_gitlab_issues.si(repo_git), group( - #collect_gitlab_issue_comments.s(repo_git), + collect_gitlab_issue_comments.s(repo_git), collect_gitlab_issue_events.si(repo_git), )), ) @@ -109,13 +114,13 @@ def primary_repo_collect_phase_gitlab(repo_git): #This phase creates the message for secondary collection tasks. #These are less important and have their own worker. 
-def secondary_repo_collect_phase(repo_git): +def secondary_repo_collect_phase(repo_git, full_collection): logger = logging.getLogger(secondary_repo_collect_phase.__name__) repo_task_group = group( - process_pull_request_files.si(repo_git), - process_pull_request_commits.si(repo_git), - chain(collect_pull_request_reviews.si(repo_git), collect_pull_request_review_comments.si(repo_git)), + process_pull_request_files.si(repo_git, full_collection), + process_pull_request_commits.si(repo_git, full_collection), + chain(collect_pull_request_reviews.si(repo_git, full_collection), collect_pull_request_review_comments.si(repo_git, full_collection)), process_ossf_dependency_metrics.si(repo_git) ) @@ -139,9 +144,7 @@ def non_repo_domain_tasks(self): enabled_tasks = [] - enabled_tasks.extend(generate_non_repo_domain_facade_tasks(logger)) - - if machine_learning_phase.__name__ in enabled_phase_names: + if not RUNNING_DOCKER and machine_learning_phase.__name__ in enabled_phase_names: #enabled_tasks.extend(machine_learning_phase()) from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model enabled_tasks.append(contributor_breadth_model.si()) @@ -153,7 +156,7 @@ def non_repo_domain_tasks(self): tasks.apply_async() -def build_primary_repo_collect_request(session,enabled_phase_names, days_until_collect_again = 1): +def build_primary_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 15): #Add all required tasks to a list and pass it to the CollectionRequest primary_enabled_phases = [] primary_gitlab_enabled_phases = [] @@ -166,17 +169,17 @@ def build_primary_repo_collect_request(session,enabled_phase_names, days_until_c primary_gitlab_enabled_phases.append(primary_repo_collect_phase_gitlab) #task success is scheduled no matter what the config says. 
- def core_task_success_util_gen(repo_git): + def core_task_success_util_gen(repo_git, full_collection): return core_task_success_util.si(repo_git) primary_enabled_phases.append(core_task_success_util_gen) primary_gitlab_enabled_phases.append(core_task_success_util_gen) - primary_request = CollectionRequest("core",primary_enabled_phases,max_repo=40, days_until_collect_again=7, gitlab_phases=primary_gitlab_enabled_phases) + primary_request = CollectionRequest("core",primary_enabled_phases,max_repo=40, days_until_collect_again=15, gitlab_phases=primary_gitlab_enabled_phases) primary_request.get_valid_repos(session) return primary_request -def build_secondary_repo_collect_request(session,enabled_phase_names, days_until_collect_again = 1): +def build_secondary_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 1): #Deal with secondary collection secondary_enabled_phases = [] @@ -186,48 +189,48 @@ def build_secondary_repo_collect_request(session,enabled_phase_names, days_until secondary_enabled_phases.append(secondary_repo_collect_phase) - def secondary_task_success_util_gen(repo_git): + def secondary_task_success_util_gen(repo_git, full_collection): return secondary_task_success_util.si(repo_git) secondary_enabled_phases.append(secondary_task_success_util_gen) - request = CollectionRequest("secondary",secondary_enabled_phases,max_repo=10, days_until_collect_again=10) + request = CollectionRequest("secondary",secondary_enabled_phases,max_repo=60, days_until_collect_again=10) request.get_valid_repos(session) return request -def build_facade_repo_collect_request(session,enabled_phase_names, days_until_collect_again = 1): +def build_facade_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 10): #Deal with facade collection facade_enabled_phases = [] facade_enabled_phases.append(facade_phase) - def facade_task_success_util_gen(repo_git): + def facade_task_success_util_gen(repo_git, full_collection): return facade_task_success_util.si(repo_git) facade_enabled_phases.append(facade_task_success_util_gen) - def facade_task_update_weight_util_gen(repo_git): + def facade_task_update_weight_util_gen(repo_git, full_collection): return git_update_commit_count_weight.si(repo_git) facade_enabled_phases.append(facade_task_update_weight_util_gen) - request = CollectionRequest("facade",facade_enabled_phases,max_repo=30, days_until_collect_again=7) + request = CollectionRequest("facade",facade_enabled_phases,max_repo=30, days_until_collect_again=10) request.get_valid_repos(session) return request -def build_ml_repo_collect_request(session,enabled_phase_names, days_until_collect_again = 1): +def build_ml_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 40): ml_enabled_phases = [] ml_enabled_phases.append(machine_learning_phase) - def ml_task_success_util_gen(repo_git): + def ml_task_success_util_gen(repo_git, full_collection): return ml_task_success_util.si(repo_git) ml_enabled_phases.append(ml_task_success_util_gen) - request = CollectionRequest("ml",ml_enabled_phases,max_repo=5, days_until_collect_again=10) + request = CollectionRequest("ml",ml_enabled_phases,max_repo=5, days_until_collect_again=40) request.get_valid_repos(session) return request @@ -240,29 +243,32 @@ def augur_collection_monitor(self): logger.info("Checking for repos to collect") - with DatabaseSession(logger, engine) as session: - #Get list of enabled phases - enabled_phase_names = get_enabled_phase_names_from_config() + + #Get list of 
enabled phases + enabled_phase_names = get_enabled_phase_names_from_config() + + enabled_collection_hooks = [] - enabled_collection_hooks = [] + with DatabaseSession(logger, self.app.engine) as session: if primary_repo_collect_phase.__name__ in enabled_phase_names: - enabled_collection_hooks.append(build_primary_repo_collect_request(session,enabled_phase_names)) + enabled_collection_hooks.append(build_primary_repo_collect_request(session, logger, enabled_phase_names)) if secondary_repo_collect_phase.__name__ in enabled_phase_names: - enabled_collection_hooks.append(build_secondary_repo_collect_request(session,enabled_phase_names)) + enabled_collection_hooks.append(build_secondary_repo_collect_request(session, logger, enabled_phase_names)) #start_secondary_collection(session, max_repo=10) if facade_phase.__name__ in enabled_phase_names: #start_facade_collection(session, max_repo=30) - enabled_collection_hooks.append(build_facade_repo_collect_request(session,enabled_phase_names)) + enabled_collection_hooks.append(build_facade_repo_collect_request(session, logger, enabled_phase_names)) - if machine_learning_phase.__name__ in enabled_phase_names: - enabled_collection_hooks.append(build_ml_repo_collect_request(session,enabled_phase_names)) + if not RUNNING_DOCKER and machine_learning_phase.__name__ in enabled_phase_names: + enabled_collection_hooks.append(build_ml_repo_collect_request(session, logger, enabled_phase_names)) #start_ml_collection(session,max_repo=5) logger.info(f"Starting collection phases: {[h.name for h in enabled_collection_hooks]}") - main_routine = AugurTaskRoutine(session,enabled_collection_hooks) + + main_routine = AugurTaskRoutine(logger, enabled_collection_hooks) main_routine.start_data_collection() @@ -278,7 +284,7 @@ def augur_collection_update_weights(self): logger.info("Updating stale collection weights") - with DatabaseSession(logger,engine) as session: + with get_session() as session: core_weight_update_repos = session.query(CollectionStatus).filter(CollectionStatus.core_weight != None).all() @@ -301,7 +307,7 @@ def augur_collection_update_weights(self): repo = Repo.get_by_id(session, status.repo_id) commit_count = status.commit_sum - date_factor = get_facade_weight_time_factor(session, repo.repo_git) + date_factor = get_facade_weight_time_factor(repo.repo_git) weight = commit_count - date_factor update_query = ( @@ -324,18 +330,26 @@ def retry_errored_repos(self): #TODO: Isaac needs to normalize the status's to be abstract in the #collection_status table once augur dev is less unstable. 
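# A minimal sketch of the retry policy implemented just below, using plain dicts in
# place of the collection_status table: an errored hook is reset to PENDING only when
# it has never finished a collection; if data was already collected it is marked
# SUCCESS instead. The status strings and helper here are illustrative, not this
# patch's CollectionState code.
def reset_errored(status, last_collected):
    if status != "Error":
        return status
    return "Pending" if last_collected is None else "Success"

rows = [
    {"core_status": "Error", "core_data_last_collected": None},
    {"core_status": "Error", "core_data_last_collected": "2024-01-01"},
]
for row in rows:
    row["core_status"] = reset_errored(row["core_status"], row["core_data_last_collected"])
print([row["core_status"] for row in rows])  # -> ['Pending', 'Success']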
- with DatabaseSession(logger,engine) as session: - query = s.sql.text(f"""UPDATE collection_status SET secondary_status = '{CollectionState.PENDING.value}'""" - f""" WHERE secondary_status = '{CollectionState.ERROR.value}' ;""" - f"""UPDATE collection_status SET core_status = '{CollectionState.PENDING.value}'""" - f""" WHERE core_status = '{CollectionState.ERROR.value}' ;""" - f"""UPDATE collection_status SET facade_status = '{CollectionState.PENDING.value}'""" - f""" WHERE facade_status = '{CollectionState.ERROR.value}' ;""" - f"""UPDATE collection_status SET ml_status = '{CollectionState.PENDING.value}'""" - f""" WHERE ml_status = '{CollectionState.ERROR.value}' ;""" - ) + query = s.sql.text(f"""UPDATE collection_status SET secondary_status = '{CollectionState.PENDING.value}'""" + f""" WHERE secondary_status = '{CollectionState.ERROR.value}' and secondary_data_last_collected is NULL;""" + f"""UPDATE collection_status SET core_status = '{CollectionState.PENDING.value}'""" + f""" WHERE core_status = '{CollectionState.ERROR.value}' and core_data_last_collected is NULL;""" + f"""UPDATE collection_status SET facade_status = '{CollectionState.PENDING.value}'""" + f""" WHERE facade_status = '{CollectionState.ERROR.value}' and facade_data_last_collected is NULL;""" + f"""UPDATE collection_status SET ml_status = '{CollectionState.PENDING.value}'""" + f""" WHERE ml_status = '{CollectionState.ERROR.value}' and ml_data_last_collected is NULL;""" + + f"""UPDATE collection_status SET secondary_status = '{CollectionState.SUCCESS.value}'""" + f""" WHERE secondary_status = '{CollectionState.ERROR.value}' and secondary_data_last_collected is not NULL;""" + f"""UPDATE collection_status SET core_status = '{CollectionState.SUCCESS.value}'""" + f""" WHERE core_status = '{CollectionState.ERROR.value}' and core_data_last_collected is not NULL;;""" + f"""UPDATE collection_status SET facade_status = '{CollectionState.SUCCESS.value}'""" + f""" WHERE facade_status = '{CollectionState.ERROR.value}' and facade_data_last_collected is not NULL;;""" + f"""UPDATE collection_status SET ml_status = '{CollectionState.SUCCESS.value}'""" + f""" WHERE ml_status = '{CollectionState.ERROR.value}' and ml_data_last_collected is not NULL;;""" + ) - session.execute_sql(query) + execute_sql(query) @@ -352,16 +366,17 @@ def create_collection_status_records(self): engine = self.app.engine logger = logging.getLogger(create_collection_status_records.__name__) - with DatabaseSession(logger,engine) as session: - query = s.sql.text(""" - SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM augur_operations.collection_status) - """) + query = s.sql.text(""" + SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM augur_operations.collection_status) + """) + + repo = execute_sql(query).first() - repo = session.execute_sql(query).first() + with DatabaseSession(logger) as session: while repo is not None: - CollectionStatus.insert(session,repo[0]) - repo = session.execute_sql(query).first() - - #Check for new repos every seven minutes to be out of step with the clone_repos task - create_collection_status_records.si().apply_async(countdown=60*7) + CollectionStatus.insert(session, logger, repo[0]) + repo = execute_sql(query).first() + + # no longer recursively run this task because collection status records are added when repos are inserted + #create_collection_status_records.si().apply_async(countdown=60*7) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 
3561b19b40..bed73bd120 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -12,13 +12,13 @@ from augur.application.db.util import execute_session_query from augur.application.db.lib import get_section from augur.tasks.github.util.util import get_repo_weight_core, get_repo_weight_by_issue -from augur.application.db.session import DatabaseSession from augur.application.db import get_engine +from augur.application.db.lib import execute_sql, get_session, get_active_repo_count, get_repo_by_repo_git from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps from augur.tasks.util.collection_state import CollectionState -def get_list_of_all_users(session): +def get_list_of_all_users(): #Get a list of all users. query = s.sql.text(""" SELECT @@ -26,7 +26,7 @@ def get_list_of_all_users(session): FROM augur_operations.users """) - users = session.execute_sql(query).fetchall() + users = execute_sql(query).fetchall() return users @@ -129,61 +129,79 @@ def __init__(self,name,phases,max_repo = 10,days_until_collect_again = 1, gitlab if name == "facade": self.new_status = CollectionState.UPDATE.value - def get_active_repo_count(self,session): - return len(session.query(CollectionStatus).filter(getattr(CollectionStatus,f"{self.name}_status" ) == CollectionState.COLLECTING.value).all()) - - #Get repo urls based on passed in info. def get_valid_repos(self,session): - #getattr(CollectionStatus,f"{hook}_status" ) represents the status of the given hook - #Get the count of repos that are currently running this collection hook - #status_column = f"{hook}_status" - active_repo_count = self.get_active_repo_count(session) - - #Will always disallow errored repos and repos that are already collecting - #The maximum amount of repos to schedule is affected by the existing repos running tasks + active_repo_count = get_active_repo_count(self.name) limit = self.max_repo-active_repo_count - #Extract the user id from the randomized list and split into four chunks - split_user_list = split_random_users_list(session,f"{self.name}_status",self.new_status) + if limit <= 0: + return - session.logger.info(f"User_list: {split_user_list}") + new_collection_git_list = get_newly_added_repos(session, limit, hook=self.name) + collection_list = [(repo_git, True) for repo_git in new_collection_git_list] + self.repo_list.extend(collection_list) + limit -= len(collection_list) - #Iterate through each fourth of the users fetched - for quarter_list in split_user_list: - if limit <= 0: - return + #Now start recollecting other repos if there is space to do so. + if limit <= 0: + return - collection_list = get_valid_repos_for_users(session,limit,tuple(quarter_list),hook=self.name, days_to_wait_until_next_collection=self.days_until_collect_again) + recollection_git_list = get_repos_for_recollection(session, limit, hook=self.name, days_until_collect_again=self.days_until_collect_again) + collection_list = [(repo_git, False) for repo_git in recollection_git_list] + self.repo_list.extend(collection_list) - self.repo_list.extend(collection_list) - #Update limit with amount of repos started - limit -= len(collection_list) - #Now start old repos if there is space to do so. 
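# A self-contained sketch of the two-pass selection that get_valid_repos now performs:
# newly added repos are scheduled first and flagged for full collection, then any
# remaining capacity goes to repos that are merely due for re-collection. The helper
# names fetch_new and fetch_recollect are placeholders, not functions from this patch.
def pick_repos(limit, fetch_new, fetch_recollect):
    chosen = [(repo_git, True) for repo_git in fetch_new(limit)]    # full collection
    limit -= len(chosen)
    if limit > 0:
        chosen += [(repo_git, False) for repo_git in fetch_recollect(limit)]  # re-collection
    return chosen

# Example: room for three repos, two of them new.
print(pick_repos(3, lambda n: ["repo-a", "repo-b"][:n], lambda n: ["repo-c", "repo-d"][:n]))
# -> [('repo-a', True), ('repo-b', True), ('repo-c', False)]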
- if limit <= 0: - return +def get_newly_added_repos(session, limit, hook): + + condition_string = "" + order_by_field = "" + if hook in ["core", "secondary", "ml"]: + condition_string += f"""{hook}_status='{str(CollectionState.PENDING.value)}'""" + order_by_field = "issue_pr_sum" + + elif hook == "facade": + condition_string += f"""facade_status='{str(CollectionState.UPDATE.value)}'""" + order_by_field = "commit_sum" + if hook == "secondary": + condition_string += f""" and core_status='{str(CollectionState.SUCCESS.value)}'""" - user_list = get_list_of_all_users(session) - random.shuffle(user_list) + repo_query = s.sql.text(f""" + select repo_git + from augur_operations.collection_status x, augur_data.repo y + where x.repo_id=y.repo_id + and {condition_string} + order by {order_by_field} + limit :limit_num + """).bindparams(limit_num=limit) + + valid_repos = session.execute_sql(repo_query).fetchall() + valid_repo_git_list = [repo[0] for repo in valid_repos] - #Extract the user id from the randomized list and split into four chunks - split_user_list = split_list_into_chunks([row[0] for row in user_list], 4) + return valid_repo_git_list + +def get_repos_for_recollection(session, limit, hook, days_until_collect_again): - for quarter_list in split_user_list: + if hook in ["core", "secondary", "ml"]: + condition_string = f"""{hook}_status='{str(CollectionState.SUCCESS.value)}'""" - #Break out if limit has been reached - if limit <= 0: - return + elif hook == "facade": + condition_string = f"""facade_status='{str(CollectionState.SUCCESS.value)}'""" - #only start repos older than the specified amount of days - #Query a set of valid repositories sorted by weight, also making sure that the repos aren't new or errored - #Order by the relevant weight for the collection hook - collection_list = get_valid_repos_for_users(session,limit,tuple(quarter_list),allow_old_repos=True,hook=self.name, days_to_wait_until_next_collection=self.days_until_collect_again) + repo_query = s.sql.text(f""" + select repo_git + from augur_operations.collection_status x, repo y + where x.repo_id = y.repo_id + and {condition_string} + and {hook}_data_last_collected <= NOW() - INTERVAL '{days_until_collect_again} DAYS' + order by {hook}_data_last_collected + limit :limit_num + """).bindparams(limit_num=limit) - self.repo_list.extend(collection_list) - limit -= len(collection_list) + valid_repos = session.execute_sql(repo_query).fetchall() + valid_repo_git_list = [repo[0] for repo in valid_repos] + + return valid_repo_git_list def get_enabled_phase_names_from_config(): @@ -224,7 +242,7 @@ def task_failed_util(self, request,exc,traceback): # log traceback to error file logger.error(f"Task {request.id} raised exception: {exc}\n{traceback}") - with DatabaseSession(logger,engine) as session: + with get_session() as session: core_id_match = CollectionStatus.core_task_id == request.id secondary_id_match = CollectionStatus.secondary_task_id == request.id facade_id_match = CollectionStatus.facade_task_id == request.id @@ -283,7 +301,7 @@ def issue_pr_task_update_weight_util(self, issue_and_pr_nums,repo_git=None,sessi if session is not None: update_issue_pr_weights(logger, session, repo_git, sum(issue_and_pr_nums)) else: - with DatabaseSession(logger,engine=engine) as session: + with get_session() as session: update_issue_pr_weights(logger,session,repo_git,sum(issue_and_pr_nums)) @@ -296,7 +314,7 @@ def core_task_success_util(self, repo_git): logger.info(f"Repo '{repo_git}' succeeded through core collection") - with 
DatabaseSession(logger, engine) as session: + with get_session() as session: repo = Repo.get_by_repo_git(session, repo_git) if not repo: @@ -363,7 +381,7 @@ def secondary_task_success_util(self, repo_git): logger.info(f"Repo '{repo_git}' succeeded through secondary collection") - with DatabaseSession(logger, engine) as session: + with get_session() as session: repo = Repo.get_by_repo_git(session, repo_git) if not repo: @@ -389,7 +407,7 @@ def get_repo_weight_secondary(logger,repo_git): engine = get_engine() - with DatabaseSession(logger,engine) as session: + with get_session() as session: repo = Repo.get_by_repo_git(session, repo_git) if not repo: raise Exception(f"Task with repo_git of {repo_git} but could not be found in Repo table") @@ -416,7 +434,7 @@ def facade_task_success_util(self, repo_git): logger.info(f"Repo '{repo_git}' succeeded through facade task collection") - with DatabaseSession(logger, engine) as session: + with get_session() as session: repo = Repo.get_by_repo_git(session, repo_git) if not repo: @@ -439,7 +457,7 @@ def ml_task_success_util(self, repo_git): logger.info(f"Repo '{repo_git}' succeeded through machine learning task collection") - with DatabaseSession(logger, engine) as session: + with get_session() as session: repo = Repo.get_by_repo_git(session, repo_git) if not repo: @@ -464,7 +482,7 @@ def facade_clone_success_util(self, repo_git): logger.info(f"Repo '{repo_git}' succeeded through facade update/clone") - with DatabaseSession(logger, engine) as session: + with get_session() as session: repo = Repo.get_by_repo_git(session, repo_git) if not repo: @@ -535,21 +553,21 @@ class to keep track of various groups of collection tasks for a group of repos. collection_hook (str): String determining the attributes to update when collection for a repo starts. e.g. core session: Database session to use """ - def __init__(self,session,collection_hooks): - self.logger = session.logger + def __init__(self, logger,collection_hooks): + self.logger = logger self.collection_hooks = collection_hooks - self.session = session - def update_status_and_id(self,repo_git, task_id, name): - repo = self.session.query(Repo).filter(Repo.repo_git == repo_git).one() + def update_status_and_id(self,repo_git, task_id, name, session): + # NOTE: Can't simply replace with lib method because it is doing .collection_status[0] afterwards + repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() #Set status in database to collecting repoStatus = repo.collection_status[0] # setattr(repoStatus,f"{name}_task_id",task_id) setattr(repoStatus,f"{name}_status", CollectionState.COLLECTING.value) - self.session.commit() + session.commit() def start_data_collection(self): @@ -563,8 +581,11 @@ def start_data_collection(self): #Send messages starts each repo and yields its running info #to concurrently update the correct field in the database. 
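# A small sketch of the yield-then-update pattern the comment above describes: the
# sender yields (repo_git, task_id, hook) as each repo is started, and the caller
# records the new status per yield. The in-memory dict stands in for the
# collection_status table and the task ids are fabricated for illustration.
status_table = {"https://github.com/org/repo-a": {}, "https://github.com/org/repo-b": {}}

def send_messages():
    for number, repo_git in enumerate(status_table):
        yield repo_git, f"task-{number}", "core"

for repo_git, task_id, hook_name in send_messages():
    status_table[repo_git][f"{hook_name}_status"] = "Collecting"
    status_table[repo_git][f"{hook_name}_task_id"] = task_id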
- for repo_git, task_id, hook_name in self.send_messages(): - self.update_status_and_id(repo_git,task_id,hook_name) + + with get_session() as session: + + for repo_git, task_id, hook_name in self.send_messages(): + self.update_status_and_id(repo_git,task_id,hook_name, session) def send_messages(self): augur_collection_list = [] @@ -572,16 +593,16 @@ def send_messages(self): for col_hook in self.collection_hooks: self.logger.info(f"Starting collection on {len(col_hook.repo_list)} {col_hook.name} repos") - - for repo_git in col_hook.repo_list: - repo = self.session.query(Repo).filter(Repo.repo_git == repo_git).one() + for repo_git, full_collection in col_hook.repo_list: + + repo = get_repo_by_repo_git(repo_git) if "github" in repo.repo_git: augur_collection_sequence = [] for job in col_hook.phases: #Add the phase to the sequence in order as a celery task. #The preliminary task creates the larger task chain - augur_collection_sequence.append(job(repo_git)) + augur_collection_sequence.append(job(repo_git, full_collection)) #augur_collection_sequence.append(core_task_success_util.si(repo_git)) #Link all phases in a chain and send to celery @@ -599,7 +620,7 @@ def send_messages(self): for job in col_hook.gitlab_phases: #Add the phase to the sequence in order as a celery task. #The preliminary task creates the larger task chain - augur_collection_sequence.append(job(repo_git)) + augur_collection_sequence.append(job(repo_git, full_collection)) #augur_collection_sequence.append(core_task_success_util.si(repo_git)) #Link all phases in a chain and send to celery @@ -610,80 +631,3 @@ def send_messages(self): #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated yield repo_git, task_id, col_hook.name - -#def start_block_of_repos(logger,session,repo_git_identifiers,phases,repos_type,hook="core"): -# -# logger.info(f"Starting collection on {len(repo_git_identifiers)} {repos_type} {hook} repos") -# if len(repo_git_identifiers) == 0: -# return 0 -# -# logger.info(f"Collection starting for {hook}: {tuple(repo_git_identifiers)}") -# -# routine = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=phases,collection_hook=hook) -# -# routine.start_data_collection() -# -# return len(repo_git_identifiers) - -def get_valid_repos_for_users(session,limit,users,allow_old_repos = False,hook="core",days_to_wait_until_next_collection = 1): - - condition_string = "1" - - if hook == "core": - condition_string = get_required_conditions_for_core_repos(allow_collected_before=allow_old_repos,days_until_collect_again= days_to_wait_until_next_collection) - elif hook == "secondary": - condition_string = get_required_conditions_for_secondary_repos(allow_collected_before=allow_old_repos,days_until_collect_again = days_to_wait_until_next_collection) - elif hook == "facade": - condition_string = get_required_conditions_for_facade_repos(allow_collected_before=allow_old_repos,days_until_collect_again = days_to_wait_until_next_collection) - elif hook == "ml": - condition_string = get_required_conditions_for_ml_repos(allow_collected_before=allow_old_repos,days_until_collect_again = days_to_wait_until_next_collection) - - #Query a set of valid repositories sorted by weight, also making sure that the repos are new - #Order by the relevant weight for the collection hook - repo_query = s.sql.text(f""" - SELECT DISTINCT repo.repo_id, repo.repo_git, collection_status.{hook}_weight - FROM augur_operations.user_groups - JOIN augur_operations.user_repos ON 
augur_operations.user_groups.group_id = augur_operations.user_repos.group_id - JOIN augur_data.repo ON augur_operations.user_repos.repo_id = augur_data.repo.repo_id - JOIN augur_operations.collection_status ON augur_operations.user_repos.repo_id = augur_operations.collection_status.repo_id - WHERE user_id IN :list_of_user_ids AND {condition_string} - ORDER BY augur_operations.collection_status.{hook}_weight - LIMIT :limit_num - """).bindparams(list_of_user_ids=users,limit_num=limit) - - #Get a list of valid repo ids, limit set to 2 times the usual - valid_repos = session.execute_sql(repo_query).fetchall() - valid_repo_git_list = [repo[1] for repo in valid_repos] - - session.logger.info(f"valid repo git list: {tuple(valid_repo_git_list)}") - - #start repos for new primary collection hook - #collection_size = start_block_of_repos( - # session.logger, session, - # valid_repo_git_list, - # phases, repos_type=repos_type, hook=hook - #) - - return valid_repo_git_list - -def split_random_users_list(session,status_col, status_new): - #Split all users that have new repos into four lists and randomize order - query = s.sql.text(f""" - SELECT - user_id - FROM augur_operations.user_groups - JOIN augur_operations.user_repos ON augur_operations.user_groups.group_id = augur_operations.user_repos.group_id - JOIN augur_data.repo ON augur_operations.user_repos.repo_id = augur_data.repo.repo_id - JOIN augur_operations.collection_status ON augur_operations.user_repos.repo_id = augur_operations.collection_status.repo_id - WHERE {status_col}='{str(status_new)}' - GROUP BY user_id - """) - - user_list = session.execute_sql(query).fetchall() - random.shuffle(user_list) - - #Extract the user id from the randomized list and split into four chunks - split_user_list = split_list_into_chunks([row[0] for row in user_list], 4) - - return split_user_list - diff --git a/augur/tasks/util/metadata_exception.py b/augur/tasks/util/metadata_exception.py new file mode 100644 index 0000000000..a861badac9 --- /dev/null +++ b/augur/tasks/util/metadata_exception.py @@ -0,0 +1,6 @@ +class MetadataException(Exception): + def __init__(self, original_exception, additional_metadata): + self.original_exception = original_exception + self.additional_metadata = additional_metadata + + super().__init__(f"{str(self.original_exception)} | Additional metadata: {self.additional_metadata}") diff --git a/augur/tasks/util/random_key_auth.py b/augur/tasks/util/random_key_auth.py index 7f7bd65557..f2fea35b36 100644 --- a/augur/tasks/util/random_key_auth.py +++ b/augur/tasks/util/random_key_auth.py @@ -33,7 +33,7 @@ def auth_flow(self, request: Request) -> Generator[Request, Response, None]: if self.list_of_keys: key_value = choice(self.list_of_keys) - self.logger.debug(f'Key value used: {key_value}') + self.logger.debug(f'Key value used in request: {key_value}') # formats the key string into a format GitHub will accept if self.key_format: diff --git a/augur/tasks/util/worker_util.py b/augur/tasks/util/worker_util.py index 6198f1ccdb..5ec2e6eebc 100644 --- a/augur/tasks/util/worker_util.py +++ b/augur/tasks/util/worker_util.py @@ -11,6 +11,9 @@ import json import subprocess +from augur.tasks.util.metadata_exception import MetadataException + + def create_grouped_task_load(*args,processes=8,dataList=[],task=None): if not dataList or not task: @@ -135,10 +138,13 @@ def parse_json_from_subprocess_call(logger, subprocess_arr, cwd=None): output = p.stdout try: - required_output = json.loads(output) + if output and output.strip(): + required_output = 
json.loads(output) + else: + required_output = {} except json.decoder.JSONDecodeError as e: logger.error(f"Could not parse required output! \n output: {output} \n Error: {e}") - raise e + raise MetadataException(e, f"output : {output}") return required_output diff --git a/augur/templates/first-time-config.j2 b/augur/templates/first-time-config.j2 new file mode 100644 index 0000000000..c9c2106d6c --- /dev/null +++ b/augur/templates/first-time-config.j2 @@ -0,0 +1,287 @@ +{# https://www.bootdey.com/snippets/view/dark-profile-settings #} + + + + + + + + + + + + + + + + +
+ {# Start sidebar #}
+ {# Start form body #}
+ {{ essential_config.title }}
+ {{ essential_config.subtitle }}
+ {% for setting in essential_config.settings %}
+ {{ setting.description }}
+ {% endfor %}
+ {% if not sections %}
+ Full Database Config
+ No database config exists
+ {% else %}
+ Full Database Config
+ {% for section in sections %}
+ {% if section.subtitle %}
+ {{ section.title }}
+ {{ section.subtitle }}
+ {% else %}
+ {{ section.title }}
+ {% endif %}
+ {% for setting in section.settings %}
+ {{ setting.description }}
+ {% endfor %}
+ {% endfor %}
+ {% endif %}
\ No newline at end of file
diff --git a/augur/templates/first-time-key.j2 b/augur/templates/first-time-key.j2
new file mode 100644
index 0000000000..372410855f
--- /dev/null
+++ b/augur/templates/first-time-key.j2
@@ -0,0 +1,41 @@
+ Enter setup key
\ No newline at end of file
diff --git a/augur/templates/first-time.j2 b/augur/templates/first-time.j2
index c8eb284da8..7a9dbccf21 100644
--- a/augur/templates/first-time.j2
+++ b/augur/templates/first-time.j2
@@ -1,18 +1,23 @@
 {# https://www.bootdey.com/snippets/view/dark-profile-settings #}
 {# Start sidebar #}
@@ -29,14 +34,17 @@
 First Time Setup
 Take a moment to create or update the configuration for your instance.
-Default values are shown. When you are done updating, click the restart button to save the settings and bring the primary server up.
+Default values are shown. When you are done updating, click the continue button to proceed to the primary configuration setup.
+Double-click an empty input field to automatically populate it with the placeholder value.

@@ -101,111 +145,156 @@ -