Python – Scrapy & Selenium Store data to MySQL

Flashdeal.py (spiders) :

import scrapy
from selenium import webdriver
from shutil import which
import time

class FlashdealSpider(scrapy.Spider):
    """Scrape flash-deal products from Tokopedia's "kejar-diskon" page.

    Scrapy only fetches the initial response; the JavaScript-rendered
    product grid is loaded through a dedicated Selenium Chrome session,
    scrolled to trigger lazy loading, then parsed element-by-element.
    """

    name = 'flashdeal'
    allowed_domains = ['www.tokopedia.com']
    start_urls = ['http://www.tokopedia.com/discovery/kejar-diskon']

    def parse(self, response):
        """Render the page in Chrome and yield one dict per product.

        Yields dicts with keys: product_name, disc_price, normal_price,
        image_url, product_url.
        """
        chrome_path = which("chromedriver")
        driver = webdriver.Chrome(executable_path=chrome_path)

        try:
            driver.get("https://www.tokopedia.com/discovery/kejar-diskon")

            # Scroll in small increments (with a pause each step) so the
            # lazily-loaded product cards have time to render.
            for _ in range(20):
                time.sleep(3)
                driver.execute_script("window.scrollBy(400,600)")

            # NOTE(review): these class names look auto-generated (CSS-in-JS)
            # and will break when Tokopedia redeploys — confirm periodically.
            product_names = driver.find_elements_by_xpath("//div[@class='css-18c4yhp']")
            disc_prices = driver.find_elements_by_xpath("//div[@class='css-rhd610']")
            normal_prices = driver.find_elements_by_xpath("//div[@data-testid='lblProductSlashPrice']")
            img_urls = driver.find_elements_by_xpath("//img[@class='success fade']")
            links = driver.find_elements_by_xpath("//a[@class='pcv3__info-content css-1qnnuob']")

            # zip() stops at the shortest list, so a card missing its slash
            # price or image no longer raises IndexError the way the
            # original index-based loop did.
            for name_el, disc_el, normal_el, img_el, link_el in zip(
                    product_names, disc_prices, normal_prices, img_urls, links):
                yield {
                    'product_name': name_el.text,
                    'disc_price': disc_el.text,
                    'normal_price': normal_el.text,
                    'image_url': img_el.get_attribute('src'),
                    'product_url': link_el.get_attribute('href'),
                }
        finally:
            # Always release the browser, even if scraping fails midway;
            # the original leaked a Chrome process per crawl.
            driver.quit()

Pipelines.py :

from itemadapter import ItemAdapter

import logging
import mysql.connector


class TokopediaPipeline(object):
    """Persist scraped Tokopedia items into the MySQL `products` table.

    One connection is opened when the spider starts and reused for every
    item (the original reconnected on every single item, which is slow
    and never closed the connection or cursor).
    """

    def open_spider(self, spider):
        # Connect once per crawl and keep a single cursor for all inserts.
        self.db = mysql.connector.connect(
            host="localhost",
            user="root",
            password="",
            database="db_tokopedia"
        )
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        # Release DB resources when the crawl ends.
        self.cursor.close()
        self.db.close()

    def process_item(self, item, spider):
        """Insert one item and return it unchanged for later pipelines.

        Uses a parameterized query, so values are escaped by the driver.
        """
        sql = (
            "INSERT INTO products "
            "(product_name, disc_price, normal_price, image_url, product_url) "
            "VALUES (%s, %s, %s, %s, %s)"
        )
        values = (
            item.get('product_name'),
            item.get('disc_price'),
            item.get('normal_price'),
            item.get('image_url'),
            item.get('product_url'),
        )
        self.cursor.execute(sql, values)
        self.db.commit()
        return item

# Note: inside process_item, the 'self.' prefix on db, cursor, sql, and val
# is unnecessary — plain local variables would work just as well there.

settings.py:

In your project's settings.py, uncomment (or add) the ITEM_PIPELINES setting so Scrapy runs the pipeline:

ITEM_PIPELINES = {
    'tokopedia.pipelines.TokopediaPipeline': 300,
}

Then add the following line so exported feeds are written with UTF-8 encoding:

FEED_EXPORT_ENCODING = 'utf-8'

Leave a Reply

Your email address will not be published. Required fields are marked *