-
Notifications
You must be signed in to change notification settings - Fork 23
Expand file tree
/
Copy pathscrape.js
More file actions
124 lines (110 loc) · 5.19 KB
/
scrape.js
File metadata and controls
124 lines (110 loc) · 5.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
// Mostly used Gemini for this since I needed something quick to scrape Google and remove dead urls
import puppeteer from 'puppeteer';
import fs from 'fs';
// Substring that identifies the Street View photo-metadata request we want to intercept.
const targetHost = 'www.google.com/maps/photometa/v1'; // This is what we're hunting
// Input worklist; assumes list.json is an array of objects with `url` and `id` fields — TODO confirm
const urlsToProcess = JSON.parse(fs.readFileSync("list.json"))
/**
 * Opens `url` in a headless browser and intercepts the first network request
 * whose URL contains `targetHost` (the Street View photometa endpoint).
 *
 * @param {string} url - Page to load (expected to trigger a photometa call).
 * @returns {Promise<string|null>} The intercepted request URL, or null on
 *   failure/timeout (errors are logged, never thrown).
 */
async function findMetadataRequest(url) {
  console.log(`[PUPPETEER] Launching browser for: ${url}`);
  let browser;
  let timeoutId;
  let onRequest;
  let page;
  try {
    browser = await puppeteer.launch();
    page = await browser.newPage();
    // Promise settled by the request listener (success) or the timer (failure).
    const foundUrlPromise = new Promise((resolve, reject) => {
      // Keep a reference to the handler so we can detach it later —
      // page.off() needs the exact listener function to remove it.
      onRequest = (request) => {
        const requestUrl = request.url();
        if (requestUrl.includes(targetHost)) {
          console.log(`[INTERCEPTED] Found target URL: ${requestUrl}...`);
          resolve(requestUrl);
        }
      };
      // Attach *before* navigating so early requests aren't missed.
      page.on('request', onRequest);
      // Hard 30-second cap on the whole operation.
      timeoutId = setTimeout(() => {
        reject(new Error(`Timeout: Did not find a request to ${targetHost} within 30 seconds.`));
      }, 30000);
    });
    // Navigation triggers the page's JS and its network calls.
    console.log(`[PUPPETEER] Navigating and waiting for requests...`);
    await page.goto(url, { waitUntil: 'networkidle2' });
    const realUrl = await foundUrlPromise;
    console.log(`[SUCCESS] Captured metadata URL.`);
    return realUrl;
  } catch (error) {
    console.error(`[ERROR] Failed to capture request:`, error.message);
    return null;
  } finally {
    // Always clear the timer (it would otherwise keep the event loop alive
    // and fire a stray rejection), detach the listener, and close the browser.
    clearTimeout(timeoutId);
    if (page && onRequest) {
      page.off('request', onRequest);
    }
    if (browser) {
      await browser.close();
    }
  }
}
import unirest from 'unirest';
// --- Run the example ---
// Await the run and surface any rejection: the original fired processList()
// without a catch, so a failure became an unhandled promise rejection.
(async () => {
  try {
    await processList();
  } catch (err) {
    console.error('[FATAL] processList failed:', err);
    process.exitCode = 1;
  }
})();
// 3. THE MAIN FUNCTION TO PROCESS THE LIST
/**
 * Walks the worklist, captures each item's photometa URL, fetches it, and
 * sorts items into tao/dead/alive buckets, persisting each bucket as JSON
 * and appending alive download links to downloads.txt.
 *
 * Fix vs. original: the unirest `.end()` callback was fire-and-forget, so the
 * loop raced ahead, the final summary printed before any response arrived,
 * and a response error was thrown from inside the callback (uncatchable).
 * The request is now promisified and awaited per item.
 */
async function processList() {
  console.log(`Starting check for ${urlsToProcess.length} URLs...`);
  const aliveUrls = [];
  const deadUrls = [];
  const taoUrls = [];
  let links = "";

  // Persist a bucket as pretty-printed JSON; log (don't throw) on failure.
  const saveJson = (file, data) => {
    try {
      fs.writeFileSync(file, JSON.stringify(data, null, 2)); // Pretty-print JSON
      console.log(`Successfully saved results to ${file}`);
    } catch (err) {
      console.error('Error writing to file:', err);
    }
  };

  // Promisified unirest GET so the loop can await each response in order.
  const fetchMeta = (url) =>
    new Promise((resolve, reject) => {
      unirest('GET', url).end((res) => {
        if (res.error) reject(new Error(res.error));
        else resolve(res);
      });
    });

  for (const item of urlsToProcess) {
    const realUrl = await findMetadataRequest(item.url);
    if (!realUrl) {
      console.log(`\nCould not capture a metadata request from that page.`);
      continue;
    }
    console.log(`\nCaptured URL: ${realUrl}`);

    let res;
    try {
      res = await fetchMeta(realUrl);
    } catch (err) {
      // Was `throw new Error(res.error)` inside the callback — uncatchable.
      console.error('[ERROR] Metadata fetch failed:', err.message);
      continue;
    }

    // Removing tao's stupid photospheres, constantly spamming the website with stupid pictures
    if (res.raw_body.includes("Táo TV")) {
      taoUrls.push(item);
      saveJson('tao_photospheres.json', taoUrls);
      console.log("fuck ass tao");
    } else if (res.raw_body.includes("[[],[[[2],")) {
      // Marker observed in responses for removed photospheres — TODO confirm
      deadUrls.push(item);
      saveJson('dead_photospheres.json', deadUrls);
      console.log("dead");
    } else if (res.raw_body.includes("[[],[[[1],")) {
      aliveUrls.push(item);
      saveJson('alive_photospheres.json', aliveUrls);
      // Strip Google's anti-JSON-hijacking prefix before parsing; the nested
      // index path presumably points at the download link — verify if format shifts.
      const link = JSON.parse(res.body.replace(")]}'", ""))[1][0][17][0];
      links = links + `${link} ${item.id}` + "\n";
      fs.writeFileSync('downloads.txt', links);
      console.log("good");
    }
  }

  // Now genuinely runs after all responses (the original printed this early).
  console.log(links);
  console.log(`\nProcess complete. Found ${aliveUrls.length} live URLs.`);
}