I would like to scrape data from this website into Google Sheets. I get an error saying the imported content is empty when I use this syntax:
=IMPORTXML("https://lokia.zohorecruit.com/jobs/Careers";"//*[@id='website_block_jobs']/career-website-job-layout3/div/div[3]/div/div[3]/div[2]/div[1]/h3/a")
I would like to scrape only the job title (in one column) and the date (in another column).
What is wrong with my syntax? Can you help me, please?
CodePudding user response:
In this case, there is no correct syntax. Content that is rendered by JavaScript is not supported by any IMPORT formula in Google Sheets. You can always check this by disabling JavaScript for a given site and seeing whether there is anything left to be scraped.
CodePudding user response:
The careers data can be obtained by using Google Apps Script. The data is stored in a hidden field and can be selected using a regular expression. There are pitfalls to this approach, and it is probably more advisable to use a proper HTML parser such as Cheerio or JSDOM. Having said that, the code is below:
/**
 * Lookup table from HTML entity codes to the characters they stand for.
 *
 * Each key is the entity exactly as it appears in the escaped markup
 * (for example "&#x3a;" or "&quot;") and each value is the decoded character.
 *
 * NOTE(review): the page is assumed to escape ASCII punctuation as numeric
 * character references. Both the hexadecimal ("&#x3a;") and the decimal
 * ("&#58;") spellings are registered because the exact form the site emits
 * has not been confirmed - verify against the real page source.
 *
 * Entries that decode to "&" are inserted last. When the table is applied
 * entry by entry in insertion order, this keeps a freshly decoded "&" from
 * joining the text after it into an entity that a later non-ampersand entry
 * would decode a second time.
 */
const ENTITY_CODES = (() => {
  // ASCII punctuation covered by the table; "&" is handled separately below.
  const PUNCTUATION = ' !"#$%\'()*+,-./:;<=>?@[\\]^_`{|}~';
  const table = {};

  for (const character of PUNCTUATION) {
    const codePoint = character.charCodeAt(0);
    table[`&#x${codePoint.toString(16)};`] = character;
    table[`&#${codePoint};`] = character;
  }

  // Named entities.
  table['&quot;'] = '"';
  table['&lt;'] = '<';
  table['&gt;'] = '>';

  // Ampersand forms go last (see the note above).
  table['&#x26;'] = '&';
  table['&#38;'] = '&';
  table['&amp;'] = '&';

  return Object.freeze(table);
})();
/**
 * Downloads the careers page and returns the job data embedded in it.
 *
 * The page carries its data as an HTML-escaped JSON string in the `value`
 * attribute of the element with `id="jobs"`. This function extracts that
 * attribute with a regular expression, decodes every entity listed in
 * ENTITY_CODES and parses the result as JSON.
 *
 * The function stays `async` so existing callers that `await` it keep
 * receiving a Promise, although the UrlFetchApp calls themselves are
 * synchronous.
 *
 * @returns {Promise<*>} The parsed JSON payload of the hidden "jobs" field.
 * @throws {Error} If the hidden "jobs" field is missing or empty.
 * @throws {SyntaxError} If the decoded field content is not valid JSON.
 */
async function getData() {
  const html = UrlFetchApp.fetch('https://lokia.zohorecruit.com/jobs/Careers').getContentText();

  const fieldMatch = html.match(/value="([^"]*)" id="jobs"/i);
  const encodedJson = fieldMatch && fieldMatch[1];
  if (!encodedJson) {
    // Fail with a clear message instead of letting JSON.parse(undefined) throw.
    throw new Error('Could not find a non-empty hidden "jobs" field in the careers page.');
  }

  let decodedJson = encodedJson;
  for (const [entityCode, entityValue] of Object.entries(ENTITY_CODES)) {
    // Escape the code so it is matched literally, whatever characters it holds.
    const literalCode = entityCode.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // A function replacer inserts the value verbatim (no "$" pattern expansion).
    decodedJson = decodedJson.replace(new RegExp(literalCode, 'gi'), () => entityValue);
  }

  return JSON.parse(decodedJson);
}
/**
 * Entry point named `onOpen`: loads the careers data through getData().
 *
 * The loaded data is not used any further here; the function only waits for
 * getData() to finish and then resolves with no value.
 *
 * @returns {Promise<void>} Resolves once getData() has completed.
 */
async function onOpen() {
  await getData();
}

