2
\$\begingroup\$

I wrote a Python script to scrape Google Maps for my app. I really want my code to be readable and have tried to follow PEP 8 wherever I could, so I have come to you all for guidance. It uses Selenium WebDriver to visit the Google Maps website in headless Chrome. Then it retrieves the page source and parses it with Beautiful Soup. I put it all into a class because classes are beautiful in my opinion and I love classes.

I want to make my code cleaner and more understandable, and any kind of suggestions are appreciated. Here is my code:

from io import open
from json import dumps
from time import sleep

from bs4 import BeautifulSoup
from selenium.webdriver import Chrome, ChromeOptions


class Scraper:
    """Scrape business listings from a Google Maps search results page.

    Constructing the object launches headless, incognito Chrome and loads the
    search page for ``query`` centred on the given coordinates.  ``run``
    parses that page, visits every listing it finds and returns the collected
    data as a JSON string; ``save`` writes that string to disk.
    """

    # ``class`` attribute values used to locate elements in the page markup.
    # NOTE(review): these look auto-generated, so they presumably change when
    # the site is updated -- verify them against the live page.
    RESULT_LINK_CLASS = "a4gq8e-aVTXAb-haAclf-jRmmHf-hSRGPd"
    PHONE_CLASS = "QSFF4-text gm2-body-2"
    RATING_CLASS = "aMPvhf-fI6EEc-KVuj8d"
    REVIEWS_CLASS = "gm2-button-alt HHrUdb-v3pZbf"

    # Seconds to pause after navigating, before the page source is read.
    # NOTE(review): a fixed pause may be too short for slow page loads;
    # consider an explicit wait on the result elements instead.
    PAGE_LOAD_DELAY = 0.1

    def __init__(self, query: str, latitude: float, longitude: float):
        """Open the Google Maps search page for ``query``.

        Args:
            query: Search text placed in the ``/maps/search/`` URL.
            latitude: Latitude of the map centre.
            longitude: Longitude of the map centre.
        """
        self.options = ChromeOptions()
        # The arguments must be added to ``self.options``; a bare ``options``
        # name does not exist in this scope and raises NameError.
        self.options.add_argument("--headless")
        self.options.add_argument("--incognito")
        self.browser = Chrome(options=self.options)
        try:
            self.browser.get(
                "https://www.google.com/maps/search/"
                f"{query}/@{latitude},{longitude},15z/"
            )
        except Exception:
            # Do not leave a Chrome process behind if navigation fails.
            self.browser.quit()
            raise
        sleep(self.PAGE_LOAD_DELAY)

    def run(self) -> str:
        """Scrape every listing on the search page.

        The browser opened by ``__init__`` is quit here, so ``run`` can only
        be called once per instance.

        Returns:
            A JSON array (as a string) with one object per listing, holding
            the keys ``Name``, ``URL``, ``Phone Number``, ``Rating`` and
            ``Number of reviews``.  Details that are not found are ``null``.
            The same string is stored in ``self.data``.
        """
        try:
            soup = BeautifulSoup(self.browser.page_source, "html.parser")
        finally:
            self.browser.quit()

        links = soup.find_all("a", {"class": self.RESULT_LINK_CLASS})
        data = [self._scrape_business(link) for link in links]

        print("Finished entire scraping successfully!")
        self.data = dumps(data)
        return self.data

    def save(self, savepath: str):
        """Write the JSON produced by ``run`` to ``savepath``.

        Raises:
            AttributeError: If ``run`` has not been called yet.
        """
        with open(savepath, "w", encoding="utf-8") as json_writer:
            json_writer.write(self.data)

    def _scrape_business(self, business) -> dict:
        """Visit one search-result link and collect that listing's details."""
        # Bind the attributes once; this also avoids nesting double quotes
        # inside f-strings, which is a SyntaxError before Python 3.12.
        name = business["aria-label"]
        url = business["href"]
        print(f"Scraping {name}...")
        print(f"The URL to {name} is at {url}")

        detail_soup = self._fetch_soup(url)
        print(f"Scraped {name}! Now parsing it...")

        print(f"Scraping phone number of {name}...")
        phone_number = self._first_text(detail_soup, "div", self.PHONE_CLASS)

        print(f"Scraping rating of {name}...")
        rating = self._first_text(detail_soup, "span", self.RATING_CLASS)

        print(f"Scraping number of reviews for {name}...")
        num_reviews = self._first_text(
            detail_soup, "button", self.REVIEWS_CLASS
        )

        return {
            "Name": name,
            "URL": url,
            "Phone Number": phone_number,
            "Rating": rating,
            "Number of reviews": num_reviews,
        }

    def _fetch_soup(self, url: str):
        """Load ``url`` in a fresh browser and return its parsed page source."""
        temp_browser = Chrome(options=self.options)
        try:
            temp_browser.get(url)
            sleep(self.PAGE_LOAD_DELAY)
            return BeautifulSoup(temp_browser.page_source, "html.parser")
        finally:
            # Always release the browser, even if loading or parsing fails.
            temp_browser.quit()

    @staticmethod
    def _first_text(soup, tag: str, css_class: str):
        """Return the text of the first ``tag`` with ``css_class``, or None."""
        match = soup.find(tag, {"class": css_class})
        return None if match is None else match.get_text()

# Example run: search for "transistors" around the given coordinates, scrape
# every result and write the collected JSON to data.json.
scraper = Scraper(
    query="transistors",
    latitude=18.5523618,
    longitude=73.826655,
)
scraper.run()
scraper.save(savepath="data.json")
\$\endgroup\$
1
  • \$\begingroup\$ I have tried to test your code, but I get errors and it doesn't open. Is it possible that it is not updated to work with the new chrome.driver? If so, how about updating it? Or if possible, can you give me more precise instructions for the installation so I can reproduce and help to improve this script? \$\endgroup\$
    – Maria_h
    Commented Apr 8, 2022 at 20:19

0

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.