Language: Python
Posted by: ulkir
Added: Apr 29, 2020 12:14 AM
Modified: Apr 29, 2020 12:14 AM
Views: 4317
Tags: web_scraping
from bs4 import BeautifulSoup as soup  # HTML data structure
from urllib.request import urlopen as uReq  # Web client

# URL to web scrape from.
# In this example we scrape graphics cards from Newegg.com
page_url = "http://www.newegg.com/Product/ProductList.aspx?Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page=1&PageSize=36&order=BESTMATCH"

# opens the connection and downloads the html page from the url
uClient = uReq(page_url)

# parses the html into a soup data structure so it can be
# traversed like a nested data structure
page_soup = soup(uClient.read(), "html.parser")
uClient.close()

# finds each product on the store page
containers = page_soup.find_all("div", {"class": "item-container"})

# name of the output file to write to local disk
out_filename = "graphics_cards.csv"
# header row of the csv file to be written
headers = "brand,product_name,shipping\n"

# opens the file and writes the headers
f = open(out_filename, "w")
f.write(headers)

# loops over each product and grabs attributes about each product
for container in containers:
    # finds all link tags "a" within the first div
    make_rating_sp = container.div.select("a")

    # grabs the brand from the image title attribute,
    # then applies proper casing with .title()
    brand = make_rating_sp[0].img["title"].title()

    # grabs the text of the third "a" tag within the first div
    product_name = container.div.select("a")[2].text

    # grabs the product shipping information by searching for
    # list items with the class "price-ship", then strips
    # whitespace and removes "$" and " Shipping" so that only
    # the number (or "Free") remains
    shipping = container.find_all("li", {"class": "price-ship"})[0].text.strip().replace("$", "").replace(" Shipping", "")

    # prints the record to the console
    print("brand: " + brand + "\n")
    print("product_name: " + product_name + "\n")
    print("shipping: " + shipping + "\n")

    # writes the record to the file
    f.write(brand + ", " + product_name.replace(",", "|") + ", " + shipping + "\n")

f.close()  # close the file
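If commas inside product names are a concern, Python's standard-library csv module can handle the quoting instead of replacing commas with "|". Below is a minimal sketch of the same scrape with csv.writer doing the output; it assumes the same Newegg page structure as the snippet above, which may have changed since this was posted.

import csv
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

page_url = "http://www.newegg.com/Product/ProductList.aspx?Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page=1&PageSize=36&order=BESTMATCH"

# download and parse the page as in the snippet above
with uReq(page_url) as uClient:
    page_soup = soup(uClient.read(), "html.parser")

containers = page_soup.find_all("div", {"class": "item-container"})

# csv.writer quotes fields that contain commas, so product names
# do not need the manual replace(",", "|") workaround
with open("graphics_cards.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(["brand", "product_name", "shipping"])
    for container in containers:
        links = container.div.select("a")
        brand = links[0].img["title"].title()
        product_name = links[2].text
        shipping = container.find_all("li", {"class": "price-ship"})[0].text.strip().replace("$", "").replace(" Shipping", "")
        writer.writerow([brand, product_name, shipping])

Using the with statement also closes the file and the connection automatically, so the explicit close() calls are no longer needed.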