Skip to content

Commit 181d1cc

Browse files
committed
feat: introduce crawler agent and orders system (DAP-4811) (#56)
* feat: introduce crawler agent PoC * ci: containerize crawler * build: add kubernetes deployment config * feat: process multiple pages in parallel * ci: ignore crawler subprojects * feat: introduce orders * refactor: extract shared types to crawler sdk * feat(crawler-agent): fetch orders from the backend * feat: fetch parser configs from the registry * feat(crawler-agent): respect cron expressions
1 parent 7ea4787 commit 181d1cc

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

66 files changed

+2655
-167
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
# Builds the crawler-agent container image and pushes it to the GitHub
# Container Registry (ghcr.io).
name: Crawler Agent

on:
  push:
    branches:
      - develop
    # NOTE(review): only changes under apps/crawler-agent trigger a rebuild;
    # the agent also depends on workspace packages, so confirm whether their
    # paths should be listed here as well.
    paths:
      - 'apps/crawler-agent/**'
  # Allows the workflow to be started manually.
  workflow_dispatch:

jobs:
  build:
    name: Build and Push container
    runs-on: ubuntu-latest
    permissions:
      contents: read
      # Needed so GITHUB_TOKEN can push the image to ghcr.io.
      packages: write
    steps:
      # Pinned to a release tag instead of the moving `main` branch so the
      # workflow does not change behavior without a commit in this repository.
      - name: "Checkout GitHub Action"
        uses: actions/checkout@v4

      - name: "Login to GitHub Container Registry"
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # The build context is the repository root because the Dockerfile
      # copies the whole monorepo before pruning it.
      - name: "Build and Push Crawler Agent Image"
        run: |
          docker build -f apps/crawler-agent/Dockerfile . --tag ghcr.io/dapplets/crawler-agent:latest
          docker push ghcr.io/dapplets/crawler-agent:latest

.github/workflows/extension-artifact-upload.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@ name: Extension Artifact Upload
22
on:
33
pull_request:
44
branches: [develop]
5+
paths-ignore:
6+
- 'apps/crawler-agent/**'
7+
- 'apps/crawler-backend/**'
8+
- 'apps/crawler-dashboard/**'
9+
- 'apps/crawler-extension/**'
510
jobs:
611
create-artifact:
712
name: Test build

.github/workflows/extension-release.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,12 @@ on:
66
- master
77
- develop
88
paths-ignore:
9+
- 'apps/crawler-agent/**'
10+
- 'apps/crawler-backend/**'
11+
- 'apps/crawler-dashboard/**'
12+
- 'apps/crawler-extension/**'
913
- 'package.json'
10-
- 'package-lock.json'
14+
- 'pnpm-lock.yaml'
1115
- 'CHANGELOG.md'
1216
jobs:
1317
release:

apps/crawler-agent/.gitignore

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
# Installed packages and Plug'n'Play artifacts
node_modules/
.pnp/
.pnp.js

# Test coverage output
coverage/
.nyc_output/

# Build output
dist/
build/

# Local environment files
.env
.env.local
.env.development
.env.test
.env.production

# Log files
logs/
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# Editor and OS files
.idea/
.vscode/
*.swp
*.swo
.DS_Store

apps/crawler-agent/.npmignore

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
# TypeScript sources (only compiled output is published)
src/

# Test files and test tooling
tests/
coverage/
*.test.ts
jest.config.js

# Development-only configuration
.github/
.vscode/
.env*
tsconfig.json
.eslintrc.js
.prettierrc
.gitignore

# Documentation sources
docs/

# Miscellaneous
*.log
.DS_Store
node_modules/
examples/

# Keep the build output and its type definitions.
# These negations must stay last: later patterns override earlier ones.
!dist/
!dist/**/*.d.ts

apps/crawler-agent/.npmrc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
; Package-manager settings for the crawler agent.
; NOTE(review): some keys are npm options and some are pnpm options; confirm
; which ones the package manager in use actually honors.

; Install-time behavior
audit=true
fund=false
engine-strict=true

; Lockfile and version pinning
package-lock=true
save-exact=true
resolution-mode=highest

apps/crawler-agent/Dockerfile

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
# Multi-stage image for the crawler agent.
#   base      - Node.js, system Chromium and pnpm (via corepack)
#   builder   - prunes the monorepo down to the crawler-agent workspace
#   installer - installs dependencies and builds the agent
#   runner    - final image, runs the CLI as a non-root user
FROM node:current-alpine AS base

# Use the distribution's Chromium; --no-cache keeps the apk index out of the layer.
RUN apk add --no-cache chromium

# Stop Puppeteer from downloading its own browser and point it at the system one.
# PUPPETEER_SKIP_DOWNLOAD is the current variable name; the legacy
# PUPPETEER_SKIP_CHROMIUM_DOWNLOAD is kept for older Puppeteer releases.
ENV PUPPETEER_SKIP_DOWNLOAD=true \
    PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \
    PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser

ENV PNPM_HOME="/pnpm"
ENV PATH="$PNPM_HOME:$PATH"
RUN corepack enable

FROM base AS builder
# Set working directory
WORKDIR /app

RUN pnpm add -g turbo@^2
COPY . .

# Generate a partial monorepo with a pruned lockfile for the target workspace.
# "@mweb/crawler-agent" is the "name" field of apps/crawler-agent/package.json.
RUN turbo telemetry disable
RUN turbo prune @mweb/crawler-agent --docker

# Add lockfile and package.json's of isolated subworkspace
FROM base AS installer
WORKDIR /app

# First install the dependencies (as they change less often)
COPY --from=builder /app/out/json/ .
RUN pnpm install

# Build the project
COPY --from=builder /app/tsconfig.base.json ./tsconfig.base.json
COPY --from=builder /app/out/full/ .
RUN pnpm run build:crawler-agent

FROM base AS runner
WORKDIR /app

# Don't run production as root
RUN addgroup --system --gid 1001 nodejs
RUN adduser --system --uid 1001 app
USER app

COPY --from=installer --chown=app:nodejs /app ./

# Exec form: node runs as PID 1 and receives SIGTERM directly, instead of
# being hidden behind an intermediate `sh -c` that does not forward signals.
CMD ["node", "apps/crawler-agent/dist/node/cli.js", "start"]

apps/crawler-agent/jest.config.js

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
/**
 * Jest configuration for the crawler agent.
 *
 * Tests are TypeScript files compiled through ts-jest using
 * tsconfig.test.json; coverage is collected on every run.
 */

/** @type {import('ts-jest').JestConfigWithTsJest} */
const config = {
  preset: 'ts-jest',
  testEnvironment: 'node',

  // Where tests live and how they are recognized.
  roots: ['<rootDir>/src', '<rootDir>/tests'],
  testMatch: ['**/*.test.ts'],
  moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'],

  // Compile .ts/.tsx files with the test-specific tsconfig.
  transform: {
    '^.+\\.tsx?$': ['ts-jest', { tsconfig: 'tsconfig.test.json' }]
  },

  // Resolve the "@/..." import alias to the src directory.
  moduleNameMapper: {
    '^@/(.*)$': '<rootDir>/src/$1'
  },

  setupFiles: ['<rootDir>/tests/setup.ts'],

  // Coverage settings.
  collectCoverage: true,
  coverageDirectory: 'coverage',
  coverageReporters: ['text', 'lcov'],
  coveragePathIgnorePatterns: [
    '/node_modules/',
    '/dist/',
    '/coverage/',
    '/tests/setup.ts'
  ]
};

module.exports = config;
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
# Kubernetes Deployment for the crawler agent.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: crawler-agent
  namespace: default
spec:
  # NOTE(review): every replica runs the same agent; confirm that the backend
  # distributes orders so the replicas do not crawl the same pages.
  replicas: 3
  selector:
    matchLabels:
      app: crawler-agent
  template:
    metadata:
      labels:
        app: crawler-agent
    spec:
      containers:
        # Named after the workload instead of the template placeholder
        # "container-name", so logs and `kubectl -c` selectors are meaningful.
        - name: crawler-agent
          image: ghcr.io/dapplets/crawler-agent:latest
          env:
            - name: API_URL
              value: "https://crawler-api.apps.dapplets.org"

apps/crawler-agent/package.json

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
{
2+
"name": "@mweb/crawler-agent",
3+
"version": "0.1.0",
4+
"main": "dist/node/index.js",
5+
"types": "dist/node/index.d.ts",
6+
"bin": {
7+
"mweb-crawler-agent": "dist/node/cli.js"
8+
},
9+
"scripts": {
10+
"start": "node dist/node/cli.js start",
11+
"build": "pnpm run clean && tsc -p tsconfig.json && chmod +x ./dist/node/cli.js && pnpm run build:inpage",
12+
"build:inpage": "webpack --mode production",
13+
"clean": "rm -rf dist",
14+
"test": "jest",
15+
"test:watch": "jest --watch",
16+
"test:coverage": "jest --coverage",
17+
"test:unit": "jest \"tests/unit/.*\\.test\\.ts$\"",
18+
"test:unit:watch": "jest \"tests/unit/.*\\.test\\.ts$\" --watch",
19+
"lint": "eslint src --ext .ts",
20+
"format": "prettier --write \"src/**/*.ts\" \"tests/**/*.ts\"",
21+
"prepublishOnly": "pnpm run build && pnpm test && pnpm run lint"
22+
},
23+
"files": [
24+
"dist",
25+
"docs"
26+
],
27+
"keywords": [
28+
"mutableweb",
29+
"dapplets",
30+
"crawler"
31+
],
32+
"author": "Mutable Web",
33+
"license": "MIT",
34+
"dependencies": {
35+
"@mweb/backend": "workspace:*",
36+
"@mweb/core": "workspace:*",
37+
"@mweb/crawler-sdk": "workspace:*",
38+
"commander": "11.1.0",
39+
"events": "^3.3.0",
40+
"node-cron": "3.0.3",
41+
"puppeteer": "23.10.4"
42+
},
43+
"devDependencies": {
44+
"@types/jest": "^29.0.0",
45+
"@types/node": "^20.0.0",
46+
"@types/node-cron": "3.0.11",
47+
"jest": "^29.0.0",
48+
"ts-jest": "^29.0.0",
49+
"typescript": "^5.0.0",
50+
"webpack": "^5.86.0",
51+
"webpack-cli": "^5.1.4"
52+
},
53+
"engines": {
54+
"node": ">=18.0.0"
55+
},
56+
"publishConfig": {
57+
"access": "public"
58+
}
59+
}

0 commit comments

Comments
 (0)