Skip to content

Commit

Permalink
Merge pull request #21 from AugustoEnzo/2-enable-the-history-data-for…
Browse files Browse the repository at this point in the history
…-olx-ads

Enable history about olx ads
  • Loading branch information
AugustoEnzo authored Jun 20, 2024
2 parents 91862eb + 6f2eeb4 commit 72b3303
Show file tree
Hide file tree
Showing 8 changed files with 324 additions and 36 deletions.
93 changes: 87 additions & 6 deletions src/main/java/com/fuse/crawlers/HistoricalOfOlxAds.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
import com.fuse.sql.erm.HistoricalOfOlxAdsEntityRelationalModel;
import com.fuse.sql.erm.OlxAdEntityRelationalModel;
import com.fuse.helpers.CrawlerHelper;
import org.openqa.selenium.TimeoutException;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebDriverException;
import org.openqa.selenium.firefox.FirefoxDriver;
Expand All @@ -14,31 +18,108 @@
import java.sql.Timestamp;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Objects;
import java.util.logging.Logger;

public class HistoricalOfOlxAds implements com.fuse.sql.constants.HistoricalOfOlxAds {
// TODO Finish implementation of Historical of olx ads

public class HistoricalOfOlxAds implements com.fuse.sql.constants.HistoricalOfOlxAds, Runnable {
private static final Logger logger = Logger.getLogger(HistoricalOfOlxAds.class.getName());
private static final CrawlerHelper crawlerHelper = new CrawlerHelper();
private static final OlxAdEntityRelationalModel olxAdEntityRelationalModel = new OlxAdEntityRelationalModel();
private static final HistoricalOfOlxAdsEntityRelationalModel historicalOfOlxAdsEntityRelationalModel = new HistoricalOfOlxAdsEntityRelationalModel();
private static final WebDriver driver = new FirefoxDriver(crawlerHelper.firefoxOptions);
public static void main(String[] args) {
public void run() {

historicalOfOlxAdsEntityRelationalModel.createTable();

driver.manage().timeouts().implicitlyWait(Duration.ofSeconds(crawlingTimeout));

try (ResultSet allAdsResultSet = olxAdEntityRelationalModel.selectAllAdLinks()) {
while (allAdsResultSet.next()) {

driver.manage().deleteAllCookies();

com.fuse.sql.models.HistoricalOfOlxAds historicalOfOlxAds = new com.fuse.sql.models.HistoricalOfOlxAds();
historicalOfOlxAds.skuId = allAdsResultSet.getLong(2);
historicalOfOlxAds.link = allAdsResultSet.getString(3);
historicalOfOlxAds.skuId = allAdsResultSet.getLong(1);
historicalOfOlxAds.link = allAdsResultSet.getString(2);
historicalOfOlxAds.collectTimestamp = Timestamp.from(Instant.now());
historicalOfOlxAds.oldJson = allAdsResultSet.getObject(4, PGobject.class);
historicalOfOlxAds.oldPrice = allAdsResultSet.getDouble(7);
historicalOfOlxAds.oldImages = allAdsResultSet.getArray(8);
historicalOfOlxAds.title = allAdsResultSet.getString(5);
historicalOfOlxAds.description = allAdsResultSet.getString(6);
historicalOfOlxAds.seller = allAdsResultSet.getString(9);
historicalOfOlxAds.category = allAdsResultSet.getString(10);
historicalOfOlxAds.subcategory = allAdsResultSet.getString(11);
historicalOfOlxAds.cep = allAdsResultSet.getLong(12);
historicalOfOlxAds.city = allAdsResultSet.getString(13);
historicalOfOlxAds.neighbourhood = allAdsResultSet.getString(14);
historicalOfOlxAds.details = new PGobject();
historicalOfOlxAds.details.setType("json");
historicalOfOlxAds.details.setValue(allAdsResultSet.getObject(15, PGobject.class).getValue());

try {
driver.get(historicalOfOlxAds.link);

Document adDocument = Jsoup.parse(driver.getPageSource());
Element olxAdJson = adDocument.getElementsByAttributeValueContaining("type", adJsonTypeValue).first();

if (olxAdJson != null) {
if (Objects.requireNonNull(olxAdJson).attr("type").equals(adJsonTypeValue)) {
String tempJSON = olxAdJson.data()
.replace("\"@context\":\"https://schema.org\",\"@type\":\"Product\",", "")
.replace("\"@type\":\"ImageObject\",", "")
.replace("\"@type\":\"Offer\",", "");

if (!Objects.requireNonNull(tempJSON).equals(allAdsResultSet.getObject(4, PGobject.class).getValue())) {
historicalOfOlxAds.newJson = new PGobject();
historicalOfOlxAds.newJson.setType("json");
historicalOfOlxAds.newJson.setValue(tempJSON);
}

JSONObject jsonObject = new JSONObject(Objects.requireNonNull(tempJSON));

try {
historicalOfOlxAds.newPrice = Double.parseDouble(jsonObject
.getJSONObject("offers")
.getString("price")
.replace(",", "."));
} catch (JSONException exception) {
logger.severe(exception.toString());
}

if (Objects.requireNonNull(historicalOfOlxAds.newPrice).equals(allAdsResultSet.getDouble(7))) {
historicalOfOlxAds.newPrice = null;
}

ArrayList<Object> imagesArray = new ArrayList<>();
for (Object imageObject : jsonObject.getJSONArray("image")) {
JSONObject jsonImageObject = new JSONObject(imageObject.toString());
imagesArray.add(jsonImageObject.getString("contentUrl"));
}

if (!Objects.requireNonNull(imagesArray).equals(allAdsResultSet.getArray(8))) {
historicalOfOlxAds.newImages = historicalOfOlxAdsEntityRelationalModel.createArrayOf(imagesArray, imagesArraySQLType);
}

historicalOfOlxAds.offline = false;
}
} else {
historicalOfOlxAds.offline = true;
}

} catch (WebDriverException webDriverException) {
historicalOfOlxAds.offline = true;
historicalOfOlxAdsEntityRelationalModel.insertNewAd(historicalOfOlxAds);
logger.severe(webDriverException.toString());
}

historicalOfOlxAdsEntityRelationalModel.insertNewAd(historicalOfOlxAds);

if (historicalOfOlxAds.offline) {
olxAdEntityRelationalModel.deleteSpecificAd(historicalOfOlxAds.skuId);
}
}
} catch (SQLException sqlException) {
logger.severe(sqlException.toString());
Expand Down
17 changes: 11 additions & 6 deletions src/main/java/com/fuse/crawlers/OlxAds.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ public class OlxAds implements com.fuse.sql.constants.OlxAds, Runnable {
private static final CrawlerHelper crawlerHelper = new CrawlerHelper();
private static final OlxAdEntityRelationalModel olxAdEntityRelationalModel = new OlxAdEntityRelationalModel();
public static final WebDriver driver = new FirefoxDriver(crawlerHelper.firefoxOptions);
// public static final WebDriver driver = new FirefoxDriver();
private static JSONObject getDetailsJSON(String details) {
JSONObject detailsJSON = new JSONObject();
String[] detailsList = details.split("\n");
Expand Down Expand Up @@ -71,12 +72,12 @@ public void run() {
.replace("\"@type\":\"Offer\",", ""));
}

JSONObject javaJsonObject = new JSONObject(Objects.requireNonNull(olxAdModel.json.getValue()));
olxAdModel.title = javaJsonObject.getString("name");
olxAdModel.description = javaJsonObject.getString("description");
JSONObject jsonObject = new JSONObject(Objects.requireNonNull(olxAdModel.json.getValue()));
olxAdModel.title = jsonObject.getString("name");
olxAdModel.description = jsonObject.getString("description");

try {
olxAdModel.price = Double.parseDouble(javaJsonObject
olxAdModel.price = Double.parseDouble(jsonObject
.getJSONObject("offers")
.getString("price")
.replace(",", "."));
Expand All @@ -85,14 +86,18 @@ public void run() {
}

ArrayList<Object> imagesArray = new ArrayList<>();
for (Object imageObject : javaJsonObject.getJSONArray("image")) {
for (Object imageObject : jsonObject.getJSONArray("image")) {
JSONObject jsonImageObject = new JSONObject(imageObject.toString());
imagesArray.add(jsonImageObject.getString("contentUrl"));
}

olxAdModel.images = olxAdEntityRelationalModel.createArrayOf(imagesArray, imagesArraySQLType);
olxAdModel.category = olxAdModel.link.split("/")[4];
olxAdModel.subcategory = olxAdModel.link.split("/")[5];
try {
olxAdModel.subcategory = olxAdModel.link.split("/")[5];
} catch (ArrayIndexOutOfBoundsException e) {
logger.severe("Couldn't fetch subcategory info");
}

// Selenium driver
driver.get(olxAdModel.link);
Expand Down
13 changes: 9 additions & 4 deletions src/main/java/com/fuse/executor/Executor.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.fuse.executor;

import com.fuse.crawlers.HistoricalOfOlxAds;
import com.fuse.crawlers.OlxAds;
import com.fuse.crawlers.OlxAdsLinks;

Expand All @@ -13,13 +14,17 @@ public class Executor {
private static final Logger logger = Logger.getLogger(Executor.class.getName());
public static void main(String[] args) {
final ScheduledExecutorService scheduledExecutorService = Executors.newScheduledThreadPool(2);
final ExecutorService executorService = Executors.newFixedThreadPool(2);
final ExecutorService executorService = Executors.newFixedThreadPool(3);

// Execute crawling of olx ads links to the database
// scheduledExecutorService.scheduleAtFixedRate(new Thread(new OlxAdsLinks()), 0, 1, TimeUnit.HOURS);
executorService.submit(new Thread(new OlxAdsLinks()));
scheduledExecutorService.scheduleAtFixedRate(new Thread(new OlxAdsLinks()), 0, 1, TimeUnit.HOURS);

// Execute crawling of olx ads to the database
// executorService.submit(new Thread(new OlxAds()));
executorService.submit(new Thread(new OlxAds()));

// Execute crawling of the olx ads history to the database
executorService.submit(new Thread(new HistoricalOfOlxAds()));

try {
boolean termination = executorService.awaitTermination(2, TimeUnit.HOURS);
if (termination) {
Expand Down
52 changes: 47 additions & 5 deletions src/main/java/com/fuse/sql/constants/HistoricalOfOlxAds.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package com.fuse.sql.constants;

public interface HistoricalOfOlxAds {
String adJsonTypeValue = "application/ld+json";
int crawlingTimeout = 3;
String imagesArraySQLType = "TEXT";
String createTableHistoricalOlxAdsQuery = """
CREATE TABLE IF NOT EXISTS historical_of_olx_ads (
id serial PRIMARY KEY,
Expand All @@ -11,7 +13,19 @@ link varchar(300) NOT NULL,
new_price double precision,
new_json JSON,
new_images TEXT[],
offline boolean
offline boolean NOT NULL,
old_price double precision,
old_json JSON,
old_images TEXT[],
title varchar(300) NOT NULL,
description text,
seller varchar(300),
category varchar(50),
subcategory varchar(100),
cep bigint,
city varchar(150),
neighbourhood varchar(100),
details JSON
);
""";

Expand All @@ -24,7 +38,19 @@ link varchar(300) NOT NULL,
new_price,
new_json,
new_images,
offline
offline,
old_price,
old_json,
old_images,
title,
description,
seller,
category,
subcategory,
cep,
city,
neighbourhood,
details
FROM
historical_of_olx_ads
WHERE
Expand All @@ -41,7 +67,19 @@ link varchar(300) NOT NULL,
new_price,
new_json,
new_images,
offline
offline,
old_price,
old_json,
old_images,
title,
description,
seller,
category,
subcategory,
cep,
city,
neighbourhood,
details
FROM
historical_of_olx_ads;
""";
Expand All @@ -51,6 +89,10 @@ link varchar(300) NOT NULL,
""";

String insertChangeIntoHistoricalOlxAdsQuery = """
INSERT INTO historical_of_olx_ads VALUES (?, ?, ?, ?, ?, ?, ?);
""";
INSERT INTO\s
historical_of_olx_ads (sku_id, link, collect_timestamp, new_price, new_json, new_images,
offline, old_price, old_json, old_images, title, description, seller, category, subcategory,
cep, city, neighbourhood, details)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
\s""";
}
1 change: 0 additions & 1 deletion src/main/java/com/fuse/sql/constants/OlxAds.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

public interface OlxAds {
String adJsonTypeValue = "application/ld+json";
String adJsonCssSelector = String.format("head > script[type='%s']", adJsonTypeValue);
String sellerMainCssSelector = ".ad__sc-ypp2u2-4";
String sellerSecondaryCssSelector = ".sc-fBuWsC";
String locationMainCssSelector = "#location > div:nth-child(1)";
Expand Down
Loading

0 comments on commit 72b3303

Please sign in to comment.