Get working relative XPath of product name on Shopee e-commerce site - python

I want to pull the product names from this site: https://shopee.com.my/search?keyword=h370m
I received support from @DebanjanB on this question: Selenium can not scrape Shopee e-commerce site using python, but I am not able to apply an XPath for the product name to that solution.
Here is my code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument('--disable-extensions')
browserdriver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Users\admin\Desktop\chromedriver_win32\Chromedriver')
browserdriver.get('https://shopee.com.my/search?keyword=h370m')
WebDriverWait(browserdriver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='shopee-modal__container']//button[text()='English']"))).click()
print([my_element.text for my_element in WebDriverWait(browserdriver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, ".//*[@class='_1JAmkB']")))])
print("Program Ended")
Also, I tried different XPaths, such as:
By.XPATH, ".//*[@class='_1JAmkB']/child::div"
or
//div[contains(concat(' ', normalize-space(@class), ' '), ' _1NoI8_ ')]
Neither of them gives me the result I expected.
The output I received was just:
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Program Ended
Please help me to solve this problem. Thanks!

XPath:
You can use this XPath, but note that you need the innerHTML, not .text (.text only returns text that is currently rendered as visible, which is likely why you were getting empty strings):
//*[@class="_1NoI8_ _2gr36I"]
Then extract the innerHTML:
print([my_element.get_attribute('innerHTML') for my_element in WebDriverWait(browserdriver, 10).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@class="_1NoI8_ _2gr36I"]')))])
CSS:
print([my_element.get_attribute('innerHTML') for my_element in WebDriverWait(browserdriver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "._1NoI8_._2gr36I")))])
API:
I still think the API is better; I showed how to use it in my earlier answer (see also the requests example in the related question below). I get the names and prices each time, so I am unsure about the intermittent issue you had (though I don't know how many times you have run it). With the API you don't need to scroll to generate all the results.
With a short wait you can extract all data also from script tags on page:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import json
browserdriver = webdriver.Chrome()
browserdriver.get('https://shopee.com.my/search?keyword=h370m')
WebDriverWait(browserdriver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[#class='shopee-modal__container']//button[text()='English']"))).click()
time.sleep(2)
products = WebDriverWait(browserdriver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '[type="application/ld+json"]')))
products_json = [product.get_attribute('innerHTML') for product in products[1:]]  # skip the first ld+json tag, which is not a product entry
names = [json.loads(product)['name'] for product in products_json]  # just showing name extraction from the JSON
print(len(names))
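If those script tags follow the usual schema.org Product markup, each entry may also carry price data under an offers key; that location is an assumption you should verify against the actual page source. A minimal sketch, continuing from the script above:
def parse_product(raw):
    # raw is the innerHTML of one ld+json script tag
    data = json.loads(raw)
    offers = data.get('offers', {})  # assumed schema.org Product layout
    if isinstance(offers, list):  # offers may be a list in some markups
        offers = offers[0] if offers else {}
    return data.get('name'), offers.get('price')
pairs = [parse_product(p) for p in products_json]
print(pairs[:5])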

Related

Grabbing text from a list with no ID or class using Selenium

I don't understand why the list I'm trying to extract the text from is returning blanks when I'm definitely using the correct XPath. Here is my code:
driver = webdriver.Firefox()
driver.get("https://www.omegawatches.com/watch-omega-specialities-first-omega-wrist-chronograph-51652483004001")
betweenLugs = driver.find_element(By.XPATH, "/html/body/div[2]/main/div[3]/div/div/div[2]/div/div[2]/div[3]/div/ul/li[1]")
print(betweenLugs.text)
This should grab the first list item and measurement
Between lugs: 20 mm
I have also tried other methods, but the fact that the XPath doesn't pick it up tells me something is wrong, and no matter how I do it I can't extract the text inside the list. Does anyone know what I am doing wrong? This is the first time I've run into this problem.
OK, try this and see if it solves the problem:
between_lugs = driver.find_element_by_xpath("//*[contains(text(), 'Between lugs')]").get_attribute("innerHTML")
between_lugs_value = driver.find_element_by_xpath("//*[contains(text(), 'Between lugs')]/../span").get_attribute("innerHTML")
final_text = between_lugs + " " + between_lugs_value
The XPath is wrong: it fails at /div[2], which doesn't match anything. This is an example of why you shouldn't use absolute paths.
The section has an id attribute, so use it:
betweenLugs = driver.find_elements(By.XPATH, "//*[@id='product-info-data-5bea7fa7406d7']/ul/li[1]")[0]
You might also want to add a wait for the page to load:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
betweenLugs = WebDriverWait(driver, 10).until(expected_conditions.visibility_of_element_located((By.XPATH, "//*[@id='product-info-data-5bea7fa7406d7']/ul/li[1]")))
That page already has jQuery on it so you can just:
driver.execute_script("return jQuery('li:contains(Between lugs)').text().trim().replace(/\s+/g, ' ')")
You can fiddle with the selectors in the Chrome console; it makes things much easier.
Another, simpler approach might be the following one. Note that it reads textContent, which comes straight from the DOM, so it is returned even for elements whose text .text would report as empty because they are not rendered as visible:
from contextlib import closing
from selenium import webdriver
from selenium.webdriver.support import ui
url = "https://www.omegawatches.com/watch-omega-specialities-first-omega-wrist-chronograph-51652483004001"
with closing(webdriver.Chrome()) as wd:
    wait = ui.WebDriverWait(wd, 10)
    wd.get(url)
    item = wait.until(lambda wd: wd.find_element_by_xpath("//*[contains(@class,'technical-data')]//li")).get_attribute('textContent')
    print(' '.join(item.split()))
Output:
Between lugs: 20 mm
Using a scroll down and a wait, with a CSS selector to target the parent li:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
driver = webdriver.Chrome() #Firefox()
driver.get("https://www.omegawatches.com/watch-omega-specialities-first-omega-wrist-chronograph-51652483004001")
driver.execute_script("window.scrollTo(0, 2000)")
betweenLugs = WebDriverWait(driver, 10).until(expected_conditions.visibility_of_element_located((By.CSS_SELECTOR, "#product-info-data-5beaf5497d916 > ul > li:nth-child(1)")))
print(betweenLugs.text)

Selenium can not scrape Shopee e-commerce site using python

I am not able to pull the prices of products on Shopee (an e-commerce site).
I have taken a look at the problem solved by @dmitrybelyakov (link: Scraping AJAX e-commerce site using python).
That solution helped me to get the 'name' of the product and the 'historical_sold', but I cannot get the price; I cannot find the price value in the JSON string.
Therefore, I tried to use Selenium to pull the data with an XPath, but that failed as well.
The link of the ecommercial site: https://shopee.com.my/search?keyword=h370m
My code:
import time
from selenium import webdriver
import pandas as pd
path = r'C:\Users\admin\Desktop\chromedriver_win32\Chromedriver'
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('headless')
chrome_options.add_argument('window-size=1200x600')
browserdriver = webdriver.Chrome(executable_path = path,options=chrome_options)
link='https://shopee.com.my/search?keyword=h370m'
browserdriver.get(link)
productprice='//*[@id="main"]/div/div[2]/div[2]/div/div/div/div[2]/div/div/div[2]/div[1]/div/a/div/div[2]/div[1]'
productprice_printout=browserdriver.find_element_by_xpath(productprice).text
print(productprice_printout)
When I run that code, it shows an error like this:
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="main"]/div/div[2]/div[2]/div/div/div/div[2]/div/div/div[2]/div[1]/div/a/div/div[2]/div[1]"}
Please help me to get the price of product on Shopee!
To extract the price of products on Shopee using Selenium and Python you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument('--disable-extensions')
browserdriver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')
browserdriver.get('https://shopee.com.my/search?keyword=h370m')
WebDriverWait(browserdriver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='shopee-modal__container']//button[text()='English']"))).click()
print([my_element.text for my_element in WebDriverWait(browserdriver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "//span[text()='RM']//following::span[1]")))])
print("Program Ended")
Console Output:
['430.00', '385.00', '435.00', '409.00', '479.00', '439.00', '479.00', '439.00', '439.00', '403.20', '369.00', '420.00', '479.00', '465.00', '465.00']
Program Ended
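If you also want the product names paired with those prices, one option is to collect both lists and zip them positionally. A minimal sketch, assuming the _1NoI8_ _2gr36I class from the related question still matches the name elements and both lists come back in the same DOM order:
names = [e.get_attribute('innerHTML') for e in WebDriverWait(browserdriver, 20).until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='_1NoI8_ _2gr36I']")))]
prices = [e.text for e in WebDriverWait(browserdriver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "//span[text()='RM']//following::span[1]")))]
for name, price in zip(names, prices):
    print(name, 'RM' + price)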
You can use requests and the search API for the site
import requests
headers = {
    'User-Agent': 'Mozilla/5',
    'Referer': 'https://shopee.com.my/search?keyword=h370m'
}
url = 'https://shopee.com.my/api/v2/search_items/?by=relevancy&keyword=h370m&limit=50&newest=0&order=desc&page_type=search'
r = requests.get(url, headers = headers).json()
for item in r['items']:
    print(item['name'], ' ', item['price'])
If you want roughly the same scale:
for item in r['items']:
    print(item['name'], ' ', 'RM' + str(item['price']/100000))
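If you would rather collect the results into a table, a minimal sketch using pandas (assuming the same r['items'] response as above; the output file name is just an example):
import pandas as pd
rows = [(item['name'], item['price'] / 100000) for item in r['items']]
df = pd.DataFrame(rows, columns=['name', 'price_rm'])
df.to_csv('h370m_results.csv', index=False)
print(df.head())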
When visiting the website I come across this popup: https://gyazo.com/0a9cd82e2c9879a1c834a82cb15020bd. My guess is that Selenium cannot find the XPath you are looking for because this popup is blocking the element.
Right after starting the Selenium session, try this:
popup=browserdriver.find_element_by_xpath('//*[@id="modal"]/div[1]/div[1]/div/div[3]/button[1]')
popup.click()
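A more robust variant would wait until the popup button is actually clickable instead of locating it immediately; a minimal sketch using the same XPath as above:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
popup = WebDriverWait(browserdriver, 10).until(
    EC.element_to_be_clickable((By.XPATH, '//*[@id="modal"]/div[1]/div[1]/div/div[3]/button[1]'))
)
popup.click()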

Get content of table in website with Python Selenium

I am trying to get the content of a table on a website using Selenium. The website seems to be set up in a rather complex manner: I can't find any element, class or content to use in the find_element_by_... functions.
If anyone has an idea how to get the content of the second table, the one starting with the header Staffel, Nr., Datum, ..., Ergebnis, Bem., it would be a big help for me. I have tried a lot (starting with urllib2, ...). In principle the following script works, loading the site and looping through the high-level containers, but I am not sure what to use to get the table content mentioned.
from selenium import webdriver
from selenium.webdriver.common.by import By
the_url = 'https://www.hvw-online.org/spielbetrieb/ergebnissetabellen/#/league?ogId=3&lId=37133&allGames=1'
driver = webdriver.Chrome()
driver.get(the_url)
elem_high = driver.find_elements(By.CLASS_NAME, 'container')
for e in elem_high:
    print(e)
# what class or element to search for second table
elem_deep = driver.find_elements(By.CLASS_NAME, 'row.game')
driver.close()
Any ideas or comments are welcome. Thanks.
To get the rows you have to wait for the page to load using WebDriverWait:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
the_url = 'https://www.hvw-online.org/spielbetrieb/ergebnissetabellen/#/league?ogId=3&lId=37133&allGames=1'
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get(the_url)
elem_deep = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "table.schedule tbody > tr")))
for e in elem_deep:
    print(e.text)
    # Link in last column
    href = e.find_element_by_css_selector("a[ng-if='row.game.sGID']").get_attribute("href")
    print(href)
But a better solution is to use the requests package to get all the information from the website. The code below is an example of how you can scrape it much faster and more easily:
import requests
url = 'https://spo.handball4all.de/service/if_g_json.php?ca=1&cl=37133&cmd=ps&og=3'
response = requests.get(url).json()
futureGames = response[0]["content"]["futureGames"]["games"]
for game in futureGames:
    print(game["gHomeTeam"])
    print(game["gGuestTeam"])
    # Link in last column
    print("http://spo.handball4all.de/misc/sboPublicReports.php?sGID=%s" % game["sGID"])
# You can use example of data below to get all you need
# {
# 'gID': '2799428',
# 'sGID': '671616',
# 'gNo': '61330',
# 'live': False,
# 'gToken': '',
# 'gAppid': '',
# 'gDate': '30.09.18',
# 'gWDay': 'So',
# 'gTime': '14:00',
# 'gGymnasiumID': '303',
# 'gGymnasiumNo': '6037',
# 'gGymnasiumName': 'Sporthalle beim Sportzentrum',
# 'gGymnasiumPostal': '71229',
# 'gGymnasiumTown': 'Leonberg',
# 'gGymnasiumStreet': 'Steinstraße 18',
# 'gHomeTeam': 'SV Leonb/Elt',
# 'gGuestTeam': 'JSG Echaz-Erms 2',
# 'gHomeGoals': '33',
# 'gGuestGoals': '20',
# 'gHomeGoals_1': '19',
# 'gGuestGoals_1': '7',
# 'gHomePoints': '2',
# 'gGuestPoints': '0',
# 'gComment': ' ',
# 'gGroupsortTxt': ' ',
# 'gReferee': ' '
# }
You can use the CSS class selector
.schedule
That is:
table = driver.find_element_by_css_selector(".schedule")
You may need a wait before that.
Then loop over the content:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
driver = webdriver.Chrome()
url ='https://www.hvw-online.org/spielbetrieb/ergebnissetabellen/#/league?ogId=3&lId=37133&allGames=1'
driver.get(url)
table = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR , '.schedule')))
headers = [elem.text for elem in driver.find_elements_by_css_selector('.schedule th')]
results = []
i = 1
for row in table.find_elements_by_css_selector('tr'):
    if i > 1:
        results.append([td.text for td in row.find_elements_by_css_selector('td')])
    i += 1
df = pd.DataFrame(results, columns = headers)
print(df)
driver.quit()

How to search, arrow down and press enter with Selenium

I'm trying to search for a company, arrow down, and press enter on inhersight.com.
I have the following code but it doesn't seem to work:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get("https://www.inhersight.com/companies")
elem = driver.find_element_by_class_name("open-search.small-hide.margin-right-20.icon-36.icon-search.reverse.cursor-pointer").click()
elem.send_keys("Apple")
elem.send_keys(Keys.ARROW_DOWN)
It doesn't seem to be able to locate the element by the class name. I've tried many things but it still doesn't work... I'm lost.
To search for a company and press enter on inhersight.com: the results are auto-suggestions, so instead of arrowing down you need to induce WebDriverWait for the desired suggestion to be clickable. You can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument('--disable-extensions')
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')
driver.get("https://www.inhersight.com/companies")
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".open-search.small-hide.margin-right-20.icon-36.icon-search.reverse.cursor-pointer"))).click()
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//input[@placeholder='Search women-rated companies']"))).send_keys("Apple")
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//li[contains(@class,'select2-highlighted')]/div[@class='select2-result-label']/div[@class='weight-medium']"))).click()
You could avoid the selections altogether, as the company name becomes part of the URL, with spaces replaced by "-" and everything lower-cased. You can therefore call .get directly on this formatted URL. You can add some handling for when the company is not found.
from selenium import webdriver
company = 'Apple Federal Credit Union' # 'apple'
base = 'https://www.inhersight.com/company/'
url = base + company.replace(' ', '-').lower()
d = webdriver.Chrome()
d.get(url)
#other stuff including handling of company not found (this text appears on the page so not hard)
#d.quit()
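A minimal sketch of that not-found handling; the marker string checked for here is an assumption, so verify it against what the page actually shows first:
page = d.page_source.lower()
if 'not found' in page:  # hypothetical marker text
    print('Company page not found for: ' + company)
else:
    print('Company page loaded: ' + d.current_url)
d.quit()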

BeautifulSoup does not extract comment tags in a dynamic page

What I need: count the number of reviews under an extension in the Chrome Store, in all languages.
What I did: tried BeautifulSoup to extract a certain tag. I researched the HTML code of the page and found a review tag.
I tried this code:
from bs4 import BeautifulSoup
import requests
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html5lib')
comments = soup.find_all('div', class_ = 'ba-bc-Xb ba-ua-zl-Xb')
But print(comments) shows that the array is empty.
I am stuck at the moment, and I see that I still need to handle two problems:
How do I cope with the select-language button? How do I count the reviews in all languages if by default only one language is selected?
The reviews are stored in different tabs. I read about extracting them dynamically but didn't get the point.
You could use Selenium to perform the tasks, wait for the page changes, and extract the review count from the PaginationMessage. Tested with a few links. You may need to add error handling for items with no reviews. There also seems to be some POST XHR activity yielding review JSON strings that you may wish to explore.
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
url = 'https://chrome.google.com/webstore/detail/evernote-web-clipper/pioclpoplcdbaefihamjohnefbikjilc?hl=en/'
#url = 'https://chrome.google.com/webstore/detail/https-everywhere/gcbommkclmclpchllfjekcdonpmejbdp?hl=en/'
d = webdriver.Chrome()
d.get(url)
WebDriverWait(d, 5).until(EC.visibility_of_element_located((By.ID, ':21'))).click()
ActionChains(d).click_and_hold(WebDriverWait(d, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.h-z-Ba-ca.ga-dd-Va.g-aa-ca')))).perform()
languageSelection = WebDriverWait(d, 5).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, '.g-aa-ca-ma-x-L')))
languageSelection[1].click()
s= WebDriverWait(d, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.Aa.dc-tf + span'))).text
print(s.split()[-1])
d.quit()
Try this:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
driver = webdriver.Chrome()
driver.get('https://chrome.google.com/webstore/detail/evernote-web-clipper/pioclpoplcdbaefihamjohnefbikjilc?hl=en')
wait = WebDriverWait(driver, 5)
wait.until(EC.visibility_of_element_located((By.ID, ':21'))).click()
wait.until(
    EC.visibility_of_element_located((By.CSS_SELECTOR, '.h-z-Ba-ca.ga-dd-Va.g-aa-ca'))
).click()
english = driver.find_element_by_xpath('//div[@class="ah-mg-j"]/span').text
print('English: ' + english.split()[-1])
wait.until(
    EC.visibility_of_element_located((By.XPATH, '//div[@class="g-aa-ca-ma-x-L" and text() = "All languages"]'))
).click()
wait.until_not(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="ah-mg-j"]/span'), english))
time.sleep(2)
AllCount = driver.find_element_by_xpath('//div[@class="ah-mg-j"]/span').text
print('All languages: ' + AllCount.split()[-1])
driver.close()
