mirror of
https://github.com/sbrow/nix.git
synced 2026-02-27 21:31:45 -05:00
feat: Added crawler template.
This commit is contained in:
3
templates/crawler/.envrc
Normal file
3
templates/crawler/.envrc
Normal file
@@ -0,0 +1,3 @@
|
||||
# Load the development shell defined by flake.nix in this directory.
use flake

# Put locally installed npm binaries on PATH. PATH_add resolves the directory
# to an absolute path and prepends it, so the entry stays valid in subdirectories.
PATH_add node_modules/.bin
|
||||
3
templates/crawler/.gitignore
vendored
Normal file
3
templates/crawler/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
.direnv
|
||||
/node_modules
|
||||
/storage
|
||||
26
templates/crawler/README.md
Normal file
26
templates/crawler/README.md
Normal file
@@ -0,0 +1,26 @@
|
||||
# TopMarket Scraper
|
||||
|
||||
A web scraper built with Crawlee for JavaScript.
|
||||
|
||||
## Setup
|
||||
|
||||
1. Install dependencies:
|
||||
```bash
|
||||
npm install
|
||||
```
|
||||
|
||||
2. Run the scraper:
|
||||
```bash
|
||||
npm start
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
Edit `index.js` to:
|
||||
- Change the `startUrls` array to target your desired websites
|
||||
- Modify the `requestHandler` to extract the data you need
|
||||
- Adjust `maxRequestsPerCrawl` to control crawling limits
|
||||
|
||||
## Output
|
||||
|
||||
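Records stored with `Dataset.pushData()` are saved to the `storage/datasets/default` directory in JSON format. The template's default `requestHandler` only logs its result, so nothing is written there until you add such a call.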
|
||||
115
templates/crawler/flake.lock
generated
Normal file
115
templates/crawler/flake.lock
generated
Normal file
@@ -0,0 +1,115 @@
|
||||
{
|
||||
"nodes": {
|
||||
"flake-parts": {
|
||||
"inputs": {
|
||||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1751413152,
|
||||
"narHash": "sha256-Tyw1RjYEsp5scoigs1384gIg6e0GoBVjms4aXFfRssQ=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"rev": "77826244401ea9de6e3bac47c2db46005e1f30b5",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1757545623,
|
||||
"narHash": "sha256-mCxPABZ6jRjUQx3bPP4vjA68ETbPLNz9V2pk9tO7pRQ=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "8cd5ce828d5d1d16feff37340171a98fc3bf6526",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-25.05",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs-lib": {
|
||||
"locked": {
|
||||
"lastModified": 1751159883,
|
||||
"narHash": "sha256-urW/Ylk9FIfvXfliA1ywh75yszAbiTEVgpPeinFyVZo=",
|
||||
"owner": "nix-community",
|
||||
"repo": "nixpkgs.lib",
|
||||
"rev": "14a40a1d7fb9afa4739275ac642ed7301a9ba1ab",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-community",
|
||||
"repo": "nixpkgs.lib",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs-unstable": {
|
||||
"locked": {
|
||||
"lastModified": 1751949589,
|
||||
"narHash": "sha256-mgFxAPLWw0Kq+C8P3dRrZrOYEQXOtKuYVlo9xvPntt8=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "9b008d60392981ad674e04016d25619281550a9d",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixpkgs-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"process-compose-flake": {
|
||||
"locked": {
|
||||
"lastModified": 1749418557,
|
||||
"narHash": "sha256-wJHHckWz4Gvj8HXtM5WVJzSKXAEPvskQANVoRiu2w1w=",
|
||||
"owner": "Platonic-Systems",
|
||||
"repo": "process-compose-flake",
|
||||
"rev": "91dcc48a6298e47e2441ec76df711f4e38eab94e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "Platonic-Systems",
|
||||
"repo": "process-compose-flake",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"flake-parts": "flake-parts",
|
||||
"nixpkgs": "nixpkgs",
|
||||
"nixpkgs-unstable": "nixpkgs-unstable",
|
||||
"process-compose-flake": "process-compose-flake",
|
||||
"treefmt-nix": "treefmt-nix"
|
||||
}
|
||||
},
|
||||
"treefmt-nix": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1752055615,
|
||||
"narHash": "sha256-19m7P4O/Aw/6+CzncWMAJu89JaKeMh3aMle1CNQSIwM=",
|
||||
"owner": "numtide",
|
||||
"repo": "treefmt-nix",
|
||||
"rev": "c9d477b5d5bd7f26adddd3f96cfd6a904768d4f9",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "treefmt-nix",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
98
templates/crawler/flake.nix
Normal file
98
templates/crawler/flake.nix
Normal file
@@ -0,0 +1,98 @@
|
||||
{
  description = "A dev environment";

  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.05";
    nixpkgs-unstable.url = "github:NixOS/nixpkgs/nixpkgs-unstable";

    flake-parts.url = "github:hercules-ci/flake-parts";
    process-compose-flake.url = "github:Platonic-Systems/process-compose-flake";
    treefmt-nix.url = "github:numtide/treefmt-nix";
    treefmt-nix.inputs.nixpkgs.follows = "nixpkgs";
  };

  outputs =
    inputs@{ self
    , flake-parts
    , nixpkgs
    , nixpkgs-unstable
    , process-compose-flake
    , treefmt-nix
    }:
    flake-parts.lib.mkFlake { inherit inputs; } {
      imports = [
        inputs.treefmt-nix.flakeModule
        # process-compose-flake is declared as an input but its module is not
        # enabled; uncomment to define long-running processes for the project.
        # inputs.process-compose-flake.flakeModule
      ];
      systems = [ "x86_64-linux" ];

      perSystem =
        { pkgs, system, inputs', ... }: {
          # Re-import nixpkgs so unfree packages are allowed and `pkgs.unstable`
          # exposes the nixpkgs-unstable package set.
          _module.args.pkgs = import nixpkgs {
            inherit system;
            config.allowUnfree = true;

            overlays = [
              (final: prev: { unstable = inputs'.nixpkgs-unstable.legacyPackages; })
            ];
          };

          treefmt = {
            # Used to find the project root
            projectRootFile = "flake.nix";
            settings.global.excludes = [
              ".direnv/**"
              ".jj/**"
              ".env"
              ".envrc"
              ".env.local"
            ];

            # Format nix files
            programs.nixpkgs-fmt.enable = true;
            programs.deadnix.enable = true;

            # Format js, json, and yaml files
            programs.prettier.enable = true;
            settings.formatter.prettier = {
              # NOTE(review): the first three entries look carried over from
              # another project; none of these paths ship with this template.
              # Confirm and prune.
              excludes = [
                "public/**"
                "resources/js/modernizr.js"
                "storage/app/caniuse.json"
                "*.md"
              ];
            };
          };

          devShells.default = pkgs.mkShell {
            buildInputs = with pkgs; [
              nodejs
              playwright-driver.browsers

              # IDE
              unstable.helix
              typescript-language-server
              vscode-langservers-extracted
            ];

            shellHook = ''
              export PLAYWRIGHT_BROWSERS_PATH=${pkgs.playwright-driver.browsers}
              export PLAYWRIGHT_SKIP_VALIDATE_HOST_REQUIREMENTS=true

              # Locate the Chromium binary without pinning its revision number,
              # which changes whenever the nixpkgs input is updated. The variable
              # is only exported when an executable is actually found.
              for chrome in ${pkgs.playwright-driver.browsers}/chromium-*/chrome-linux/chrome; do
                if [ -x "$chrome" ]; then
                  export PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH="$chrome"
                  break
                fi
              done
            '';
          };
        };
    };
}
|
||||
28
templates/crawler/index.js
Normal file
28
templates/crawler/index.js
Normal file
@@ -0,0 +1,28 @@
|
||||
import { PlaywrightCrawler, Dataset } from 'crawlee';
|
||||
|
||||
/**
 * Entry point: hands the seed URLs to the module-level `crawler` and waits
 * for the crawl to finish.
 *
 * @returns {Promise<void>} Settles once `crawler.run()` has settled.
 */
async function main() {
  // Seed list for the crawl; edit it to target other sites.
  const startUrls = ['https://ipchicken.com'];

  await crawler.run(startUrls);
}
|
||||
|
||||
/**
 * Crawler instance shared by `main()`.
 *
 * Runs a headless browser whose executable path is read from the
 * PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH environment variable.
 */
const crawler = new PlaywrightCrawler({
  /**
   * Reads the text of the element matched by `p[align=center] b` and logs
   * its first line as the detected IP address.
   */
  requestHandler: async ({ page, log }) => {
    const basic = await page.locator('p[align=center] b').innerText();

    // Keep only the text that precedes the first newline.
    const [ip] = basic.split('\n');

    log.info(`Your ip is: '${ip}'`);
  },
  // maxRequestsPerCrawl: 50,
  launchContext: {
    launchOptions: {
      executablePath: process.env.PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH,
    },
  },
  headless: true,
});
|
||||
|
||||
// Top-level await works here because package.json declares "type": "module".
// Keep this call below the `crawler` declaration: `main()` reads that `const`,
// which is not initialized until its declaration has been evaluated.
await main();
|
||||
3631
templates/crawler/package-lock.json
generated
Normal file
3631
templates/crawler/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
17
templates/crawler/package.json
Normal file
17
templates/crawler/package.json
Normal file
@@ -0,0 +1,17 @@
|
||||
{
|
||||
"name": "crawler",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"start": "node index.js",
|
||||
"dev": "node --watch index.js",
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
},
|
||||
"type": "module",
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"crawlee": "^3.14.1"
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user