Skip to content

Commit 7fcbce0

Browse files
committed
Initial Commit
0 parents  commit 7fcbce0

File tree

4 files changed

+807
-0
lines changed

4 files changed

+807
-0
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
node_modules
2+
.DS_Store
3+
hm/

hm_parser.js

Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
import { parse } from "node-html-parser";
2+
import { parse as csv_parser } from "csv-parse";
3+
import fs from "fs";
4+
import { stringify } from "csv-stringify";
5+
import axios from "axios";
6+
import { AxiosError } from "axios";
7+
8+
// Output layout: <brand>/<gender>/articles.csv indexes every listed article,
// and <brand>/<gender>/articles/<articleCode>.json holds one crawled article.
const brandDirectory = "hm";
const genderDirectory = `${brandDirectory}/women`;
const articlesDirectory = `${genderDirectory}/articles`;

/**
 * Listing ("view all") page that is paginated to discover article URLs.
 * Change this URL to the view-all page of the gender being scraped.
 *
 * NOTE(review): this points at the men's listing while genderDirectory is
 * ".../women" — confirm which one is intended and keep the two in sync.
 */
const viewAll = "https://www2.hm.com/en_in/men/shop-by-product/view-all.html";

// Make sure the output directories exist before anything writes into them.
// An "already exists" failure is harmless; anything else is rethrown.
try {
  fs.mkdirSync(articlesDirectory, { recursive: true });
} catch (e) {
  if (e.code !== "EEXIST") throw e;
}
21+
22+
// Pool of browser User-Agent strings; callers pick one entry per request.
const user_agents = [
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
  "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148",
  "Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36",
];

// Browser-like headers attached to every request made through `instance`.
const defaultHeaders = {
  Accept:
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  "Accept-Encoding": "gzip, deflate, br",
  "Accept-Language":
    "en-IN,en;q=0.9,hi-IN;q=0.8,hi;q=0.7,fr-FR;q=0.6,fr;q=0.5,en-US;q=0.4,en-GB;q=0.3",
  "Cache-Control": "no-cache",
};

// Shared axios client used for every HTTP request in this script.
const instance = axios.create({
  headers: defaultHeaders,
  // Only 2xx responses resolve; any other status rejects the request.
  validateStatus: (status) => status >= 200 && status < 300,
});
43+
44+
/**
 * Scrape the paginated listing at `viewAll` and write one CSV row per article
 * (`id`, `category`, `item_url`) to `${genderDirectory}/articles.csv`.
 *
 * The first request only reads the total article count; the listing is then
 * fetched again in pages of 100 until that count is covered.
 *
 * @returns {Promise<void>} Resolves once every page has been processed and the
 *   CSV file has been finalised. Rejects when a request fails, the total count
 *   cannot be read, or the file cannot be written.
 */
async function fetchAllUrls() {
  const pageSize = 100;

  const landing = await instance.get(viewAll);
  const total = readTotalArticleCount(parse(landing.data));

  const stringifier = stringify({
    delimiter: ",",
    header: true,
    columns: ["id", "category", "item_url"],
  });
  const writableStream = fs.createWriteStream(
    `${genderDirectory}/articles.csv`
  );
  const flushed = new Promise((resolve, reject) => {
    writableStream.once("finish", resolve);
    writableStream.once("error", reject);
    stringifier.once("error", reject);
  });
  // Mark the promise as handled: a stream error that fires while a page is
  // still being fetched must surface at the `await` below, not as an
  // unhandled rejection.
  flushed.catch(() => {});
  stringifier.pipe(writableStream);

  try {
    for (let offset = 0; offset < total; offset += pageSize) {
      const response = await instance.get(
        `${viewAll}?offset=${offset}&page-size=${pageSize}`
      );
      console.log(response.status);

      for (const row of extractArticleRows(parse(response.data))) {
        stringifier.write(row);
      }
    }
  } finally {
    // Always end the CSV stream so the file is finalised, even after a
    // failed page request.
    stringifier.end();
    await flushed;
  }
}

/** Read the total article count advertised by the "load more" heading. */
function readTotalArticleCount(doc) {
  const heading = doc
    .getElementsByTagName("h2")
    .find((h2) => h2.rawAttributes?.class?.includes("load-more-heading"));
  const total = Number.parseInt(heading?.attributes["data-total"], 10);
  if (!Number.isInteger(total) || total < 0) {
    throw new Error(`Could not read the article count from ${viewAll}`);
  }
  return total;
}

/** Build one [articleCode, category, href] row per product tile on a page. */
function extractArticleRows(doc) {
  const rows = [];
  for (const article of doc.getElementsByTagName("article")) {
    if (!article.attributes.class?.includes("hm-product-item")) continue;

    const imageContainer = article.childNodes.find((node) =>
      node?.attributes?.class?.includes("image-container")
    );
    const link = imageContainer?.childNodes.find(
      (node) => node.rawTagName === "a"
    );
    if (!link) {
      // A tile without a link cannot be crawled later; report it and move on
      // so one odd tile does not abort the whole page.
      console.warn(
        "Skipping product tile without a link:",
        article.attributes["data-articlecode"]
      );
      continue;
    }

    rows.push([
      article.attributes["data-articlecode"],
      article.attributes["data-category"],
      link.attributes["href"],
    ]);
  }
  return rows;
}
105+
106+
/**
 * Read `${genderDirectory}/articles.csv` (skipping the header line) and
 * schedule a crawl for every row whose article does not yet have a JSON file
 * in `articlesDirectory`.
 *
 * Each crawl is started after its own random delay of up to 5 seconds so the
 * requests are not all fired in the same instant. Failures to read or parse
 * the CSV are logged instead of crashing the process.
 */
function fetchAllArticles() {
  const articlesFile = `${genderDirectory}/articles.csv`;
  const reportFailure = (err) =>
    console.error(`Failed to read ${articlesFile}:`, err.message);

  fs.createReadStream(articlesFile)
    // pipe() does not forward errors, so the file stream and the parser each
    // need their own handler.
    .on("error", reportFailure)
    .pipe(
      csv_parser({
        delimiter: ",",
        from_line: 2,
      })
    )
    .on("error", reportFailure)
    .on("data", (data) => {
      // A JSON file for this article already exists, so skip it.
      if (fs.existsSync(`${articlesDirectory}/${data[0]}.json`)) return;

      setTimeout(() => crawlArticle(data), Math.floor(Math.random() * 5000));
    });
}
124+
125+
/**
 * Download one product page and store its `productArticleDetails` object as
 * `${articlesDirectory}/<articleCode>.json`.
 *
 * Failures are logged together with the article code and never thrown, so a
 * single bad article does not take down the whole run.
 *
 * @param {string[]} article - CSV row: [articleCode, category, itemUrlPath].
 * @returns {Promise<void>} Settles once the article was written or the
 *   failure was logged.
 */
async function crawlArticle(article) {
  const [articleCode, , itemPath] = article;

  try {
    const response = await instance.get(`https://www2.hm.com${itemPath}`, {
      headers: {
        "User-Agent":
          user_agents[Math.floor(Math.random() * user_agents.length)],
      },
    });

    const productArticleDetails = evaluateProductScript(response.data);
    fs.writeFileSync(
      `${articlesDirectory}/${articleCode}.json`,
      JSON.stringify(productArticleDetails)
    );
  } catch (err) {
    if (err instanceof AxiosError) {
      console.log(err.code, articleCode);
    } else {
      // Parsing/evaluation/write problems used to vanish silently.
      console.error(`Failed to crawl article ${articleCode}:`, err);
    }
  }
}

/** Evaluate the product page's inline script and return `productArticleDetails`. */
function evaluateProductScript(html) {
  const doc = parse(html);
  const productContainer = doc
    .getElementsByTagName("div")
    .find((div) => div.attributes?.class?.includes("product parbase"));
  // The script is read from the container's fourth child node.
  const script = productContainer?.childNodes[3]?.rawText;
  if (!script) {
    throw new Error("Product details script not found in page");
  }

  // The locals below look unused but must keep these exact names: the direct
  // eval() resolves identifiers from this scope. They are presumably stubs
  // for browser globals the page script reads — confirm against a live page
  // before removing any of them.
  /* eslint-disable no-unused-vars */
  const isDesktop = true;
  const window = {
    innerWidth: 1080,
  };
  let productAvailabilityUrl = "";
  const hm = {
    options: {
      product: {
        productAvailabilityServiceUrl: "",
      },
      pdpAccordion: "",
    },
    i18n: {
      sustainability: {
        starterButton: "",
        modalTitle: "",
      },
    },
  };
  /* eslint-enable no-unused-vars */

  // SECURITY(review): this executes JavaScript downloaded from the remote
  // site with full access to this process. Kept because the data is embedded
  // as script source rather than JSON; consider extracting the object literal
  // and parsing it, or running it in a sandbox.
  return eval(script + "; productArticleDetails;");
}
178+
///
179+
// Build the article index first, then crawl from it. Starting both at once
// lets the crawler read articles.csv while the index step is about to
// rewrite it. Promise.resolve() copes with fetchAllUrls() returning either a
// promise or nothing.
Promise.resolve(fetchAllUrls())
  .then(() => fetchAllArticles())
  .catch((err) => {
    console.error("Scrape failed:", err);
    process.exitCode = 1;
  });
181+
182+
// const allIds = [];
183+
// const articlesFile = `${genderDirectory}/articles.csv`;
184+
// const readableStream = fs.createReadStream(articlesFile);
185+
186+
// readableStream
187+
// .pipe(
188+
// csv_parser({
189+
// delimiter: ",",
190+
// from_line: 2,
191+
// })
192+
// )
193+
// .on("data", (data) => {
194+
// allIds.push(data[0]);
195+
// })
196+
// .on("finish", () => {
197+
// const allFiles = [];
198+
// fs.readdir(`${articlesDirectory}`, function (err, files) {
199+
// //handling error
200+
// if (err) {
201+
// return console.log("Unable to scan directory: " + err);
202+
// }
203+
// //listing all files using forEach
204+
// files.forEach(function (file) {
205+
// // Do whatever you want to do with the file
206+
// const fileName = file.split(".json");
207+
// allFiles.push(fileName[0]);
208+
// });
209+
210+
// var set = new Set(allIds);
211+
// console.log(set.size);
212+
// let unique1 = allIds.filter((o) => allFiles.indexOf(o) === -1);
213+
// let unique2 = allFiles.filter((o) => allIds.indexOf(o) === -1);
214+
// const unique = unique1.concat(unique2);
215+
216+
// console.log(allIds.length, allFiles.length);
217+
// console.log(unique);
218+
// });
219+
// });

0 commit comments

Comments
 (0)