Help scrapy /u/Fiatsheee Python Education

Hi, For a school project I am scraping the IMDB site and I need to scrape the genre.

https://preview.redd.it/7livs3g7wube1.png?width=1742&format=png&auto=webp&s=fe71deb9aed689258d84a4cf80e0ed07e22b7223

This is the element sectie where the genre is stated.

However with different codes I still can not scrape the genre.

Can u guys maybe help me out?

Code I have currently:

import scrapy from selenium import webdriver from webdriver_manager.chrome import ChromeDriverManager from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By from selenium.webdriver.chrome.options import Options import time import re class ImdbSpider(scrapy.Spider): name = 'imdb_spider' allowed_domains = ['imdb.com'] start_urls = ['https://www.imdb.com/chart/top/?ref_=nv_mv_250'] def __init__(self, *args, **kwargs): super(ImdbSpider, self).__init__(*args, **kwargs) chrome_options = Options() chrome_options.binary_location = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" # Mac location self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options) def parse(self, response): self.driver.get(response.url) time.sleep(5) # Give time for page to load completely # Step 1: Extract the links to the individual film pages movie_links = self.driver.find_elements(By.CSS_SELECTOR, 'a.ipc-lockup-overlay') seen_urls = set() # Initialize a set to track URLs we've already seen for link in movie_links: full_url = link.get_attribute('href') # Get the full URL of each movie link if full_url.startswith("https://www.imdb.com/title/tt") and full_url not in seen_urls: seen_urls.add(full_url) yield scrapy.Request(full_url, callback=self.parse_movie) def parse_movie(self, response): # Extract data from the movie page title = response.css('h1 span::text').get().strip() genre = response.css('li[data-testid="storyline-genres"] a::text').get() # Extract the release date text and apply regex to get "Month Day, Year" release_date_text = response.css('a[href*="releaseinfo"]::text').getall() release_date_text = ' '.join(release_date_text).strip() # Use regex to extract the month, day, and year (e.g., "October 14, 1994") match = re.search(r'([A-Za-z]+ d{1,2}, d{4})', release_date_text) if match: release_date = match.group(0) # This gives the full date "October 14, 1994" else: release_date = 'Not found' # Extract the director's name director = response.css('a.ipc-metadata-list-item__list-content-item--link::text').get() # Extract the actors' names actors = response.css('a[data-testid="title-cast-item__actor"]::text').getall() yield { 'title': title, 'genre': genre, 'release_date': release_date, 'director': director, 'actors': actors, 'url': response.url } def closed(self, reason): # Close the browser after scraping is complete self.driver.quit() 

submitted by /u/Fiatsheee
[link] [comments]

​r/learnpython Hi, For a school project I am scraping the IMDB site and I need to scrape the genre. https://preview.redd.it/7livs3g7wube1.png?width=1742&format=png&auto=webp&s=fe71deb9aed689258d84a4cf80e0ed07e22b7223 This is the element sectie where the genre is stated. However with different codes I still can not scrape the genre. Can u guys maybe help me out? Code I have currently: import scrapy from selenium import webdriver from webdriver_manager.chrome import ChromeDriverManager from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By from selenium.webdriver.chrome.options import Options import time import re class ImdbSpider(scrapy.Spider): name = ‘imdb_spider’ allowed_domains = [‘imdb.com’] start_urls = [‘https://www.imdb.com/chart/top/?ref_=nv_mv_250’] def __init__(self, *args, **kwargs): super(ImdbSpider, self).__init__(*args, **kwargs) chrome_options = Options() chrome_options.binary_location = “/Applications/Google Chrome.app/Contents/MacOS/Google Chrome” # Mac location self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options) def parse(self, response): self.driver.get(response.url) time.sleep(5) # Give time for page to load completely # Step 1: Extract the links to the individual film pages movie_links = self.driver.find_elements(By.CSS_SELECTOR, ‘a.ipc-lockup-overlay’) seen_urls = set() # Initialize a set to track URLs we’ve already seen for link in movie_links: full_url = link.get_attribute(‘href’) # Get the full URL of each movie link if full_url.startswith(“https://www.imdb.com/title/tt”) and full_url not in seen_urls: seen_urls.add(full_url) yield scrapy.Request(full_url, callback=self.parse_movie) def parse_movie(self, response): # Extract data from the movie page title = response.css(‘h1 span::text’).get().strip() genre = response.css(‘li[data-testid=”storyline-genres”] a::text’).get() # Extract the release date text and apply regex to get “Month Day, Year” release_date_text = response.css(‘a[href*=”releaseinfo”]::text’).getall() release_date_text = ‘ ‘.join(release_date_text).strip() # Use regex to extract the month, day, and year (e.g., “October 14, 1994”) match = re.search(r'([A-Za-z]+ d{1,2}, d{4})’, release_date_text) if match: release_date = match.group(0) # This gives the full date “October 14, 1994″ else: release_date = ‘Not found’ # Extract the director’s name director = response.css(‘a.ipc-metadata-list-item__list-content-item–link::text’).get() # Extract the actors’ names actors = response.css(‘a[data-testid=”title-cast-item__actor”]::text’).getall() yield { ‘title’: title, ‘genre’: genre, ‘release_date’: release_date, ‘director’: director, ‘actors’: actors, ‘url’: response.url } def closed(self, reason): # Close the browser after scraping is complete self.driver.quit() submitted by /u/Fiatsheee [link] [comments] 

Hi, For a school project I am scraping the IMDB site and I need to scrape the genre.

https://preview.redd.it/7livs3g7wube1.png?width=1742&format=png&auto=webp&s=fe71deb9aed689258d84a4cf80e0ed07e22b7223

This is the element sectie where the genre is stated.

However with different codes I still can not scrape the genre.

Can u guys maybe help me out?

Code I have currently:

import scrapy from selenium import webdriver from webdriver_manager.chrome import ChromeDriverManager from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By from selenium.webdriver.chrome.options import Options import time import re class ImdbSpider(scrapy.Spider): name = 'imdb_spider' allowed_domains = ['imdb.com'] start_urls = ['https://www.imdb.com/chart/top/?ref_=nv_mv_250'] def __init__(self, *args, **kwargs): super(ImdbSpider, self).__init__(*args, **kwargs) chrome_options = Options() chrome_options.binary_location = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" # Mac location self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options) def parse(self, response): self.driver.get(response.url) time.sleep(5) # Give time for page to load completely # Step 1: Extract the links to the individual film pages movie_links = self.driver.find_elements(By.CSS_SELECTOR, 'a.ipc-lockup-overlay') seen_urls = set() # Initialize a set to track URLs we've already seen for link in movie_links: full_url = link.get_attribute('href') # Get the full URL of each movie link if full_url.startswith("https://www.imdb.com/title/tt") and full_url not in seen_urls: seen_urls.add(full_url) yield scrapy.Request(full_url, callback=self.parse_movie) def parse_movie(self, response): # Extract data from the movie page title = response.css('h1 span::text').get().strip() genre = response.css('li[data-testid="storyline-genres"] a::text').get() # Extract the release date text and apply regex to get "Month Day, Year" release_date_text = response.css('a[href*="releaseinfo"]::text').getall() release_date_text = ' '.join(release_date_text).strip() # Use regex to extract the month, day, and year (e.g., "October 14, 1994") match = re.search(r'([A-Za-z]+ d{1,2}, d{4})', release_date_text) if match: release_date = match.group(0) # This gives the full date "October 14, 1994" else: release_date = 'Not found' # Extract the director's name director = response.css('a.ipc-metadata-list-item__list-content-item--link::text').get() # Extract the actors' names actors = response.css('a[data-testid="title-cast-item__actor"]::text').getall() yield { 'title': title, 'genre': genre, 'release_date': release_date, 'director': director, 'actors': actors, 'url': response.url } def closed(self, reason): # Close the browser after scraping is complete self.driver.quit() 

submitted by /u/Fiatsheee
[link] [comments] 

Leave a Reply

Your email address will not be published. Required fields are marked *