feat: Added crawler template.

This commit is contained in:
Spencer Brower
2025-09-12 12:15:27 -04:00
parent 313051a9bd
commit a95cf11b08
8 changed files with 3921 additions and 0 deletions

3
templates/crawler/.envrc Normal file
View File

@@ -0,0 +1,3 @@
# Load the Nix dev shell defined by the flake in this directory (direnv `use flake`).
use flake
# Append ./vendor/bin (relative to the current directory) to PATH.
# NOTE(review): this template is a Node project; ./node_modules/.bin may be the
# directory that was intended here — confirm.
export PATH="$PATH:./vendor/bin"

3
templates/crawler/.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
.direnv
/node_modules
/storage

View File

@@ -0,0 +1,26 @@
# TopMarket Scraper
A web scraper built with Crawlee for JavaScript.
## Setup
1. Install dependencies:
```bash
npm install
```
2. Run the scraper:
```bash
npm start
```
## Configuration
Edit `index.js` (the file that `npm start` runs) to:
- Change the `startUrls` array to target your desired websites
- Modify the `requestHandler` to extract the data you need
- Uncomment and adjust `maxRequestsPerCrawl` to control crawling limits
## Output
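Scraped data stored with Crawlee's `Dataset.pushData()` is saved to the `storage/datasets/default` directory in JSON format. The template's `requestHandler` only logs its result, so add a `pushData` call to produce output there.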

115
templates/crawler/flake.lock generated Normal file
View File

@@ -0,0 +1,115 @@
{
"nodes": {
"flake-parts": {
"inputs": {
"nixpkgs-lib": "nixpkgs-lib"
},
"locked": {
"lastModified": 1751413152,
"narHash": "sha256-Tyw1RjYEsp5scoigs1384gIg6e0GoBVjms4aXFfRssQ=",
"owner": "hercules-ci",
"repo": "flake-parts",
"rev": "77826244401ea9de6e3bac47c2db46005e1f30b5",
"type": "github"
},
"original": {
"owner": "hercules-ci",
"repo": "flake-parts",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1757545623,
"narHash": "sha256-mCxPABZ6jRjUQx3bPP4vjA68ETbPLNz9V2pk9tO7pRQ=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "8cd5ce828d5d1d16feff37340171a98fc3bf6526",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-25.05",
"repo": "nixpkgs",
"type": "github"
}
},
"nixpkgs-lib": {
"locked": {
"lastModified": 1751159883,
"narHash": "sha256-urW/Ylk9FIfvXfliA1ywh75yszAbiTEVgpPeinFyVZo=",
"owner": "nix-community",
"repo": "nixpkgs.lib",
"rev": "14a40a1d7fb9afa4739275ac642ed7301a9ba1ab",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "nixpkgs.lib",
"type": "github"
}
},
"nixpkgs-unstable": {
"locked": {
"lastModified": 1751949589,
"narHash": "sha256-mgFxAPLWw0Kq+C8P3dRrZrOYEQXOtKuYVlo9xvPntt8=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "9b008d60392981ad674e04016d25619281550a9d",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixpkgs-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"process-compose-flake": {
"locked": {
"lastModified": 1749418557,
"narHash": "sha256-wJHHckWz4Gvj8HXtM5WVJzSKXAEPvskQANVoRiu2w1w=",
"owner": "Platonic-Systems",
"repo": "process-compose-flake",
"rev": "91dcc48a6298e47e2441ec76df711f4e38eab94e",
"type": "github"
},
"original": {
"owner": "Platonic-Systems",
"repo": "process-compose-flake",
"type": "github"
}
},
"root": {
"inputs": {
"flake-parts": "flake-parts",
"nixpkgs": "nixpkgs",
"nixpkgs-unstable": "nixpkgs-unstable",
"process-compose-flake": "process-compose-flake",
"treefmt-nix": "treefmt-nix"
}
},
"treefmt-nix": {
"inputs": {
"nixpkgs": [
"nixpkgs"
]
},
"locked": {
"lastModified": 1752055615,
"narHash": "sha256-19m7P4O/Aw/6+CzncWMAJu89JaKeMh3aMle1CNQSIwM=",
"owner": "numtide",
"repo": "treefmt-nix",
"rev": "c9d477b5d5bd7f26adddd3f96cfd6a904768d4f9",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "treefmt-nix",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

View File

@@ -0,0 +1,98 @@
# Development environment for the crawler template: a flake-parts flake that
# configures treefmt and a dev shell with Node.js and Playwright browsers.
{
description = "A dev environment";
inputs = {
# Stable package set (nixos-25.05 branch); this is what `pkgs` is built from below.
nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.05";
# Unstable package set; exposed below as `pkgs.unstable` through an overlay.
nixpkgs-unstable.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
flake-parts.url = "github:hercules-ci/flake-parts";
# NOTE(review): still declared as an input, but its flakeModule import below is commented out.
process-compose-flake.url = "github:Platonic-Systems/process-compose-flake";
treefmt-nix.url = "github:numtide/treefmt-nix";
# Make treefmt-nix use this flake's `nixpkgs` input rather than its own.
treefmt-nix.inputs.nixpkgs.follows = "nixpkgs";
};
outputs =
inputs@{ self
, flake-parts
, nixpkgs
, nixpkgs-unstable
, process-compose-flake
, treefmt-nix
}:
flake-parts.lib.mkFlake { inherit inputs; } {
imports = [
inputs.treefmt-nix.flakeModule
# inputs.process-compose-flake.flakeModule
];
# Per-system outputs are only generated for x86_64-linux.
systems = [ "x86_64-linux" ];
perSystem =
{ pkgs, system, inputs', ... }: {
# Override the `pkgs` module argument: import nixpkgs with unfree packages
# allowed and an overlay that adds `pkgs.unstable`, taken from the
# nixpkgs-unstable input's legacyPackages.
_module.args.pkgs = import nixpkgs {
inherit system;
config.allowUnfree = true;
overlays = [
(final: prev: { unstable = inputs'.nixpkgs-unstable.legacyPackages; })
];
};
treefmt = {
# Used to find the project root
projectRootFile = "flake.nix";
# Paths no formatter should touch.
settings.global.excludes = [
".direnv/**"
".jj/**"
".env"
".envrc"
".env.local"
];
# Format nix files
programs.nixpkgs-fmt.enable = true;
programs.deadnix.enable = true;
# Format js, json, and yaml files
programs.prettier.enable = true;
# Extra paths skipped by prettier only.
# NOTE(review): the public/, resources/js/ and storage/app/ entries do not
# correspond to anything visible in this template — presumably carried over
# from another project; confirm or prune.
settings.formatter.prettier =
{
excludes = [
"public/**"
"resources/js/modernizr.js"
"storage/app/caniuse.json"
"*.md"
];
};
};
# NOTE(review): disabled process-compose example. It references `php`, which is
# not defined in this file, and `${$pks.redis}` looks like a typo for
# `${pkgs.redis}`; fix both (and re-enable the flakeModule import above) before
# uncommenting.
/*
process-compose.default.settings.processes = {
web.command = "sudo ${pkgs.caddy}/bin/caddy run";
mail.command = "${pkgs.mailhog}/bin/MailHog";
php.command = "${php}/bin/php-fpm -F -y php-fpm.conf";
redis.command = "${$pks.redis}/bin/redis-server";
};
*/
# Default dev shell: Node.js, the Playwright browser bundle, and editor tooling.
devShells.default = pkgs.mkShell
{
buildInputs = with pkgs; [
nodejs
playwright-driver.browsers
# IDE
unstable.helix
typescript-language-server
vscode-langservers-extracted
];
# Point Playwright at the Nix-provided browsers when the shell starts.
# PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH is read by the crawler script and passed
# to Playwright as `executablePath`.
# NOTE(review): the `chromium-1169` directory name is hard-coded; presumably it
# has to match the browser revision shipped by the pinned playwright-driver —
# confirm after every flake.lock update.
shellHook = ''
export PLAYWRIGHT_BROWSERS_PATH=${pkgs.playwright-driver.browsers}
export PLAYWRIGHT_SKIP_VALIDATE_HOST_REQUIREMENTS=true
export PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH="${pkgs.playwright-driver.browsers}/chromium-1169/chrome-linux/chrome"
'';
};
};
};
}

View File

@@ -0,0 +1,28 @@
import { PlaywrightCrawler, Dataset } from 'crawlee';
/**
 * Script entry point: passes the seed URL list to the module-level `crawler`
 * and waits for `crawler.run` to settle.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Seed pages for the crawl; add or swap URLs here to target other sites.
  const startUrls = ['https://ipchicken.com'];

  await crawler.run(startUrls);
}
/**
 * Playwright-backed crawler instance used by `main`.
 *
 * For every handled page it reads the text of the bold element inside the
 * centred paragraph (`p[align=center] b`), keeps only the first line of that
 * text and logs it as the visitor's IP.
 */
const crawler = new PlaywrightCrawler({
  headless: true,
  launchContext: {
    launchOptions: {
      // Browser binary is taken from the environment; `undefined` when the
      // variable is not set.
      executablePath: process.env.PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH,
    },
  },
  // maxRequestsPerCrawl: 50,
  requestHandler: async (context) => {
    const { page, log } = context;

    const boldText = await page.locator('p[align=center] b').innerText();
    // Only the first line of the matched text is reported.
    const [ip] = boldText.split('\n');

    log.info(`Your ip is: '${ip}'`);
  },
});
// Start the crawl. Top-level `await` works here because this file is an ES
// module (it uses `import`); a rejection from `main` is left unhandled at
// module level.
await main();

3631
templates/crawler/package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,17 @@
{
"name": "crawler",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"start": "node index.js",
"dev": "node --watch index.js",
"test": "echo \"Error: no test specified\" && exit 1"
},
"type": "module",
"author": "",
"license": "ISC",
"dependencies": {
"crawlee": "^3.14.1"
}
}