feat: add support for multiple searches and discord channels
mevljas committed Nov 1, 2024
1 parent ed52943 commit c0a276f
Showing 5 changed files with 127 additions and 109 deletions.
7 changes: 3 additions & 4 deletions README.md
@@ -58,11 +58,10 @@ cd 'C:\Program Files\Google\Chrome\Application'

```

## Run the crawler
## Configure the bot
- Add the Discord bot token to the **.env** file (see the sketch after this diff).
- Add Discord channel ID and nepremicnine.net search URL pairs to the **config.txt** file.

```bash
python main.py
```

## PgAdmin (optional)

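For the **.env** entry referenced in the README above, here is a minimal sketch. The `DISCORD_TOKEN` variable name is an assumption; match whatever key `main.py` actually reads:

```
# .env sketch: the DISCORD_TOKEN variable name is an assumption.
DISCORD_TOKEN=your-bot-token-here
```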
1 change: 1 addition & 0 deletions config.txt
@@ -0,0 +1 @@
1294990979475963994 https://www.nepremicnine.net/oglasi-oddaja/ljubljana-mesto/stanovanje/2-sobno,2.5-sobno,3-sobno,3.5-sobno,4-sobno,4.5-sobno,5-in-vecsobno,apartma/cena-od-300-do-900-eur-na-mesec,velikost-od-30-m2/
98 changes: 51 additions & 47 deletions services/discord_service.py
@@ -37,63 +37,67 @@ async def my_background_task(self):
Background task that runs every hour.
:return:
"""
channel = self.get_channel(1294990979475963994) # channel ID goes here
# await channel.send("Hello, world!")

# Setup database manager.
database_manager = DatabaseManager(
url="sqlite+aiosqlite:///nepremicnine_database.sqlite"
)

# Run the spider.
listings = await run_spider(database_manager=database_manager)

logging.debug("Found %s new listings.", len(listings))

await channel.send(f"Found {len(listings)} new listings.")

for listing in listings:
title, image_url, description, prices, size, year, floor, url = listing

logging.debug("Listing: %s", listing)

embed = discord.Embed(
title=title,
url=url,
description=description,
color=discord.Color.blue(),
)
if image_url:
embed.set_image(url=image_url)
embed.add_field(
name="**Cena**",
value=f"{prices[0]:.2f} €",
inline=True,
)
embed.add_field(
name="**Velikost**",
value=f"{size:.2f} m²",
inline=True,
)
embed.add_field(
name="**Zgrajeno leta**",
value=year,
inline=True,
)
embed.add_field(
name="**Nadstropje**",
value=floor,
inline=True,
)

if len(prices) > 1:
channel_listings = await run_spider(database_manager=database_manager)

for channel_id, listings in channel_listings.items():
logging.debug("Sending listings to channel %s.", channel_id)

channel = self.get_channel(int(channel_id))  # channel IDs come from config.txt

logging.debug("Found %s new listings.", len(listings))

await channel.send(f"Found {len(listings)} new listings.")

for listing in listings:
title, image_url, description, prices, size, year, floor, url = listing

logging.debug("Listing: %s", listing)

embed = discord.Embed(
title=title,
url=url,
description=description,
color=discord.Color.blue(),
)
if image_url:
embed.set_image(url=image_url)
embed.add_field(
name="**Prejšnje cene**",
value=", ".join(f"{price:.2f} €" for price in prices[1:]),
inline=False,
name="**Cena**",
value=f"{prices[0]:.2f} €",
inline=True,
)
embed.add_field(
name="**Velikost**",
value=f"{size:.2f} m²",
inline=True,
)
embed.add_field(
name="**Zgrajeno leta**",
value=year,
inline=True,
)
embed.add_field(
name="**Nadstropje**",
value=floor,
inline=True,
)

if len(prices) > 1:
embed.add_field(
name="**Prejšnje cene**",
value=", ".join(f"{price:.2f} €" for price in prices[1:]),
inline=False,
)

await channel.send(embed=embed)
await channel.send(embed=embed)

@my_background_task.before_loop
async def before_my_task(self):
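For context, the loop body in this diff runs inside a `discord.ext.tasks` loop. Below is a minimal sketch of the surrounding scaffolding: the hourly interval matches the docstring and the `before_loop` hook matches the diff, while the client class name and `setup_hook` are assumptions about the rest of the file:

```python
# Sketch of the task scaffolding around my_background_task; see assumptions above.
import discord
from discord.ext import tasks


class NepremicnineClient(discord.Client):
    async def setup_hook(self) -> None:
        # Start the hourly task while the client is being set up.
        self.my_background_task.start()

    @tasks.loop(hours=1)
    async def my_background_task(self) -> None:
        ...  # run the spider and send one embed per listing, as in the diff above

    @my_background_task.before_loop
    async def before_my_task(self) -> None:
        # Don't query channels before the bot has logged in.
        await self.wait_until_ready()
```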
4 changes: 2 additions & 2 deletions services/extract_service.py
@@ -41,7 +41,7 @@ async def parse_page(
await browser_page.locator("xpath=//*[@id='pagination']/ul/li[4]/a").count() > 0
)

logger.info("Extracting finished.")
logger.info("Parsing page finished.")

return extracted_data, more_pages

@@ -123,7 +123,7 @@ async def parse_result(
url,
)

logger.info("Extracting finished.")
logger.info("Parsing result finished.")

return item_id, (
title,
126 changes: 70 additions & 56 deletions spider/spider.py
@@ -1,5 +1,7 @@
"""Module that contains main spider logic."""

from collections import defaultdict

from playwright.async_api import async_playwright

from database.database_manager import DatabaseManager
@@ -13,93 +15,105 @@ async def run_spider(database_manager: DatabaseManager):
"""
logger.info("Spider started.")

# List of new listings to send to Discord.
discord_listings = []
# Dictionary of new listings to send, keyed by Discord channel ID.
discord_listings = defaultdict(list)

async with async_playwright() as playwright:
# Connect to the browser.
# We need to use a real browser because of Cloudflare protection.
browser = await playwright.chromium.connect_over_cdp("http://localhost:9222")

# create a new page inside context.
browser_page = await browser.new_page()
# Read page URLs from the config file.
config = await read_config()

# Prevent loading some resources for better performance.
# await browser_page.route("**/*", block_aggressively)
saved_results = await database_manager.get_listings()

page_url = (
"https://www.nepremicnine.net/oglasi-oddaja/ljubljana-mesto"
"/stanovanje/2-sobno,2.5-sobno,3-sobno,3.5-sobno,"
"4-sobno,4.5-sobno,5-in-vecsobno,apartma/cena-od-300"
"-do-900-eur-na-mesec,velikost-od-30-m2/"
)
results = {}

await browser_page.goto(page_url)
# For each URL, send the results to a different channel.
for channel, page_url in config:
logger.info("Processing channel %s with URL %s", channel, page_url)

# await browser_page.pause()
# create a new page inside context.
browser_page = await browser.new_page()

saved_results = await database_manager.get_listings()
# Prevent loading some resources for better performance.
# await browser_page.route("**/*", block_aggressively)

more_pages = True
results = {}
index = 1
await browser_page.goto(page_url)

# await browser_page.pause()

more_pages = True

while more_pages:
if index > 1:
# Close the previous page.
await browser_page.close()
index = 1

# create a new page.
browser_page = await browser.new_page()
await browser_page.goto(f"{page_url}{index}/")
while more_pages:
if index > 1:
# Close the previous page.
await browser_page.close()

results_tmp, more_pages = await parse_page(browser_page=browser_page)
results.update(results_tmp)
index += 1
# create a new page.
browser_page = await browser.new_page()

for nepremicnine_id, new_data in results.items():
logger.debug("Listing ID: %s", nepremicnine_id)
await browser_page.goto(f"{page_url}{index}/")

if nepremicnine_id in saved_results:
logger.debug("Listing already saved.")
results_tmp, more_pages = await parse_page(browser_page=browser_page)
results.update(results_tmp)
index += 1

_, _, _, new_price, _, _, _, _ = new_data
for nepremicnine_id, new_data in results.items():
logger.debug("Listing ID: %s", nepremicnine_id)

listing_id, old_prices = saved_results[nepremicnine_id]
if nepremicnine_id in saved_results:
logger.debug("Listing already saved.")

if old_prices[-1] != new_price:
logger.info("New saved_price detected for %s.", nepremicnine_id)
await database_manager.add_new_price(
listing_id=listing_id,
current_price=new_price,
)
_, _, _, new_price, _, _, _, _ = new_data

print("New data before merging: ", new_data)
listing_id, old_prices = saved_results[nepremicnine_id]

# Merge old and new prices.
old_prices.append(new_price)
new_data = new_data[:3] + (old_prices,) + new_data[4:]
if old_prices[-1] != new_price:
logger.info("New saved_price detected for %s.", nepremicnine_id)
await database_manager.add_new_price(
listing_id=listing_id,
current_price=new_price,
)

print("New data after merging: ", new_data)
print("New data before merging: ", new_data)

discord_listings.append(new_data)
# Merge old and new prices.
old_prices.append(new_price)
new_data = new_data[:3] + (old_prices,) + new_data[4:]

else:
logger.debug("No new saved_price detected.")
print("New data after merging: ", new_data)

continue
discord_listings[channel].append(new_data)

# We found a new listing.
logger.info("New listing found %s.", nepremicnine_id)
else:
logger.debug("No new saved_price detected.")

await database_manager.save_listing(nepremicnine_id, new_data)
continue

# Convert price to a list of prices
new_data = new_data[:3] + ([new_data[3]],) + new_data[4:]
discord_listings.append(new_data)
await browser_page.close()
# We found a new listing.
logger.info("New listing found %s.", nepremicnine_id)

await database_manager.save_listing(nepremicnine_id, new_data)

# Convert price to a list of prices
new_data = new_data[:3] + ([new_data[3]],) + new_data[4:]
discord_listings[channel].append(new_data)
await browser_page.close()

await browser.close()
logger.info("Spider finished. Found %d new listings.", len(discord_listings))

return discord_listings


async def read_config():
"""
Read the config file.
Each line contains a Discord channel ID and a search URL, separated by whitespace.
"""
with open("config.txt", encoding="utf-8") as file:
return [line.strip().split() for line in file.readlines()]
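`read_config` splits each line on whitespace, so a blank line or a stray space in a URL would break the channel/URL pairing. A slightly more defensive variant, as a sketch (not part of this commit):

```python
# Hypothetical, more defensive read_config variant (not part of this commit).
async def read_config_strict(path: str = "config.txt"):
    """Return (channel_id, url) pairs, skipping blank lines and # comments."""
    pairs = []
    with open(path, encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line or line.startswith("#"):
                continue  # ignore blank lines and comments
            channel_id, url = line.split(maxsplit=1)  # split only on the first gap
            pairs.append((channel_id, url))
    return pairs
```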

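The spider connects to an already-running Chrome over CDP because, per the comment in `spider.py`, a real browser is needed to get past Cloudflare. A standalone connectivity check, as a sketch; it assumes Chrome was launched with `--remote-debugging-port=9222`, matching the `http://localhost:9222` endpoint in the diff:

```python
# Standalone CDP connectivity check (sketch); assumes Chrome is already
# running with --remote-debugging-port=9222.
import asyncio

from playwright.async_api import async_playwright


async def check_cdp() -> None:
    async with async_playwright() as playwright:
        browser = await playwright.chromium.connect_over_cdp("http://localhost:9222")
        print("Connected to Chrome:", browser.is_connected())
        await browser.close()


asyncio.run(check_cdp())
```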