|
| 1 | +import { parse } from "node-html-parser"; |
| 2 | +import { parse as csv_parser } from "csv-parse"; |
| 3 | +import fs from "fs"; |
| 4 | +import { stringify } from "csv-stringify"; |
| 5 | +import axios from "axios"; |
| 6 | +import { AxiosError } from "axios"; |
| 7 | + |
// Output layout: <brand>/<gender>/articles holds one JSON file per product.
const brandDirectory = "hm";
const genderDirectory = `${brandDirectory}/women`;
const articlesDirectory = `${genderDirectory}/articles`;
/**
 * Listing page to crawl. Change this URL to the "view all" page of the gender
 * being scraped.
 * NOTE(review): this URL targets the `men` listing while genderDirectory is
 * `women` - confirm which of the two is intended.
 */
const viewAll = "https://www2.hm.com/en_in/men/shop-by-product/view-all.html";

// Make sure the output directory exists before anything is written into it.
try {
  fs.mkdirSync(articlesDirectory, { recursive: true });
} catch (error) {
  const alreadyExists = error.code === "EEXIST";
  if (!alreadyExists) {
    throw error;
  }
}
| 21 | + |
// Pool of browser User-Agent strings; crawlArticle picks one at random for
// each product page request.
const user_agents = [
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
  "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148",
  "Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36",
];

// Browser-like request headers sent with every request made through `instance`.
const defaultHeaders = {
  Accept:
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  "Accept-Encoding": "gzip, deflate, br",
  "Accept-Language":
    "en-IN,en;q=0.9,hi-IN;q=0.8,hi;q=0.7,fr-FR;q=0.6,fr;q=0.5,en-US;q=0.4,en-GB;q=0.3",
  "Cache-Control": "no-cache",
};

// Shared HTTP client: only 2xx responses resolve, anything else rejects.
const instance = axios.create({
  headers: defaultHeaders,
  validateStatus: (status) => status >= 200 && status < 300, // default
});
| 43 | + |
/**
 * Crawls the paginated "view all" listing and records every product in
 * `${genderDirectory}/articles.csv` with the columns `id`, `category` and
 * `item_url`.
 *
 * The first request is used only to read the advertised product count; the
 * listing is then requested in pages of 100 until that count is covered.
 *
 * @returns {Promise<void>} Resolves once every row has been flushed and the
 *   CSV file is closed, so callers can safely read the file afterwards.
 * @throws {Error} Rejects when a request fails, the product count cannot be
 *   read from the page, or the CSV file cannot be written.
 */
async function fetchAllUrls() {
  const pageSize = 100;
  const articlesFile = `${genderDirectory}/articles.csv`;

  const landing = await instance.get(viewAll);
  const total = readListingTotal(parse(landing.data));

  const stringifier = stringify({
    delimiter: ",",
    header: true,
    columns: ["id", "category", "item_url"],
  });
  const writableStream = fs.createWriteStream(articlesFile);
  const flushed = new Promise((resolve, reject) => {
    writableStream.once("finish", resolve);
    writableStream.once("error", reject);
    stringifier.once("error", reject);
  });
  // A write error can occur while pages are still being fetched; mark the
  // promise as handled now. The rejection is still observed by the final await.
  flushed.catch(() => {});
  stringifier.pipe(writableStream);

  try {
    for (let offset = 0; offset < total; offset += pageSize) {
      const pageUrl = new URL(viewAll);
      pageUrl.searchParams.set("offset", String(offset));
      pageUrl.searchParams.set("page-size", String(pageSize));

      const response = await instance.get(pageUrl.href);
      console.log(response.status);

      for (const row of extractArticleRows(parse(response.data))) {
        stringifier.write(row);
      }
    }
  } finally {
    // Always end the stream so buffered rows are flushed and the file closes.
    stringifier.end();
  }

  await flushed;
}

/**
 * Reads the total product count from the `data-total` attribute of the
 * `h2` whose class contains `load-more-heading`.
 *
 * @param {object} doc Parsed listing page.
 * @returns {number} Total number of products advertised by the listing.
 * @throws {Error} When the heading or a numeric `data-total` is missing.
 */
function readListingTotal(doc) {
  const heading = doc
    .getElementsByTagName("h2")
    .find((h2) => h2.attributes.class?.includes("load-more-heading"));
  const total = Number.parseInt(heading?.attributes["data-total"], 10);
  if (Number.isNaN(total)) {
    throw new Error(`Could not read the product count from ${viewAll}`);
  }
  return total;
}

/**
 * Extracts one `[id, category, item_url]` row per `article` element whose
 * class contains `hm-product-item`.
 *
 * @param {object} doc Parsed listing page.
 * @returns {Array<Array<string|undefined>>} Rows ready for the CSV stringifier.
 */
function extractArticleRows(doc) {
  const rows = [];
  for (const article of doc.getElementsByTagName("article")) {
    if (!article.attributes.class?.includes("hm-product-item")) {
      continue;
    }
    // The product link is the <a> inside the child whose class contains
    // "image-container"; text nodes carry no attributes and are skipped.
    const imageContainer = article.childNodes.find((node) =>
      node.attributes?.class?.includes("image-container")
    );
    const link = imageContainer?.childNodes.find(
      (node) => node.rawTagName === "a"
    );
    if (!link) {
      console.warn(
        "Skipping product without a link:",
        article.attributes["data-articlecode"]
      );
      continue;
    }
    rows.push([
      article.attributes["data-articlecode"],
      article.attributes["data-category"],
      link.attributes["href"],
    ]);
  }
  return rows;
}
| 105 | + |
/**
 * Streams `${genderDirectory}/articles.csv` (skipping the header line) and
 * schedules `crawlArticle` for every row whose
 * `${articlesDirectory}/<id>.json` file does not exist yet.
 *
 * Each crawl is delayed by a random 0-5000 ms so the requests are not all
 * sent at the same instant.
 *
 * Read and parse failures (for example, the CSV file not existing yet) are
 * logged instead of crashing the process through an unhandled stream
 * 'error' event.
 */
function fetchAllArticles() {
  const articlesFile = `${genderDirectory}/articles.csv`;
  const reportError = (err) =>
    console.error(`Unable to read ${articlesFile}:`, err);

  fs.createReadStream(articlesFile)
    // pipe() does not forward errors, so both streams need their own listener.
    .on("error", reportError)
    .pipe(
      csv_parser({
        delimiter: ",",
        from_line: 2,
      })
    )
    .on("error", reportError)
    .on("data", (row) => {
      // Already crawled on a previous run: nothing to do.
      if (fs.existsSync(`${articlesDirectory}/${row[0]}.json`)) {
        return;
      }
      setTimeout(() => crawlArticle(row), Math.floor(Math.random() * 5000));
    });
}
| 124 | + |
/**
 * Downloads one product page, evaluates the inline script found in the
 * `product parbase` div to obtain `productArticleDetails`, and stores the
 * result as `${articlesDirectory}/<id>.json`.
 *
 * @param {string[]} article CSV row `[id, category, item_url]`.
 * @returns {Promise<void>} Settles when the article has been processed.
 *   Failures are logged and never rejected, so one bad page does not stop
 *   the remaining crawls.
 */
async function crawlArticle(article) {
  const [articleId, , itemUrl] = article;

  // Stand-ins for names the page script is presumably expecting from the
  // browser. The script runs through direct `eval` below and resolves them
  // from this scope, so they must stay declared here even though nothing
  // else in this function reads them.
  const isDesktop = true;
  const window = {
    innerWidth: 1080,
  };
  // `let` on purpose: the evaluated script may assign to it.
  let productAvailabilityUrl = "";
  const hm = {
    options: {
      product: {
        productAvailabilityServiceUrl: "",
      },
      pdpAccordion: "",
    },
    i18n: {
      sustainability: {
        starterButton: "",
        modalTitle: "",
      },
    },
  };

  try {
    const pageUrl = new URL(itemUrl, "https://www2.hm.com");
    const response = await instance.get(pageUrl.href, {
      headers: {
        "User-Agent":
          user_agents[Math.floor(Math.random() * user_agents.length)],
      },
    });

    const doc = parse(response.data);
    const productDiv = doc
      .getElementsByTagName("div")
      .find((div) => div.attributes?.class?.includes("product parbase"));
    // The script is taken from the fourth child node of the product div.
    const script = productDiv?.childNodes[3]?.rawText;
    if (script === undefined) {
      console.error(`No product script found for article ${articleId}`);
      return;
    }

    // SECURITY(review): this executes script text downloaded from the remote
    // site with full access to this process. Only run against trusted pages;
    // consider extracting the data without evaluating code.
    const details = eval(script + "; productArticleDetails;");
    fs.writeFileSync(
      `${articlesDirectory}/${articleId}.json`,
      JSON.stringify(details)
    );
  } catch (err) {
    if (err instanceof AxiosError) {
      console.log(err.code, articleId);
    } else {
      // Parsing, evaluation and file errors used to vanish silently.
      console.error(`Failed to process article ${articleId}:`, err);
    }
  }
}
// Entry point: start the listing crawl, then the per-article crawl.
// NOTE(review): neither call is awaited, so fetchAllArticles() begins reading
// articles.csv while fetchAllUrls() may still be fetching pages and writing
// that same file - confirm this ordering is intended.
fetchAllUrls();
fetchAllArticles();
| 181 | + |
| 182 | +// const allIds = []; |
| 183 | +// const articlesFile = `${genderDirectory}/articles.csv`; |
| 184 | +// const readableStream = fs.createReadStream(articlesFile); |
| 185 | + |
| 186 | +// readableStream |
| 187 | +// .pipe( |
| 188 | +// csv_parser({ |
| 189 | +// delimiter: ",", |
| 190 | +// from_line: 2, |
| 191 | +// }) |
| 192 | +// ) |
| 193 | +// .on("data", (data) => { |
| 194 | +// allIds.push(data[0]); |
| 195 | +// }) |
| 196 | +// .on("finish", () => { |
| 197 | +// const allFiles = []; |
| 198 | +// fs.readdir(`${articlesDirectory}`, function (err, files) { |
| 199 | +// //handling error |
| 200 | +// if (err) { |
| 201 | +// return console.log("Unable to scan directory: " + err); |
| 202 | +// } |
| 203 | +// //listing all files using forEach |
| 204 | +// files.forEach(function (file) { |
| 205 | +// // Do whatever you want to do with the file |
| 206 | +// const fileName = file.split(".json"); |
| 207 | +// allFiles.push(fileName[0]); |
| 208 | +// }); |
| 209 | + |
| 210 | +// var set = new Set(allIds); |
| 211 | +// console.log(set.size); |
| 212 | +// let unique1 = allIds.filter((o) => allFiles.indexOf(o) === -1); |
| 213 | +// let unique2 = allFiles.filter((o) => allIds.indexOf(o) === -1); |
| 214 | +// const unique = unique1.concat(unique2); |
| 215 | + |
| 216 | +// console.log(allIds.length, allFiles.length); |
| 217 | +// console.log(unique); |
| 218 | +// }); |
| 219 | +// }); |
0 commit comments