Python – Scraping Flashdeal with Scrapy

Install Scrapy

pip install scrapy

Create Project

scrapy startproject scrapping

cd scrapping

Generate Spiders

scrapy genspider banggood sea.banggood.com/Flashdeals.html

Modify the file scrapping/spiders/banggood.py

Scraping a Single URL:


import scrapy


class BanggoodSpider(scrapy.Spider):
    """Scrape product titles and prices from the Banggood flash-deals page."""

    name = 'banggood'
    # Entries here must be bare host names. With a trailing slash (or a
    # scheme/path) the entry cannot equal a request's host, so Scrapy's
    # offsite filtering would not treat sea.banggood.com as allowed.
    allowed_domains = ['sea.banggood.com']

    def start_requests(self):
        """Yield the single flash-deals request, sent with a browser User-Agent."""
        yield scrapy.Request(url='https://sea.banggood.com/Flashdeals.html', callback=self.parse, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
        })

    def parse(self, response):
        """Yield one ``{'title', 'price'}`` dict per ``li.product-item`` element.

        Either value is ``None`` when the corresponding node is missing,
        because ``.get()`` returns ``None`` for an empty selection.
        """
        contents = response.xpath("//li[@class='product-item']")
        for content in contents:
            # Relative XPaths (leading ".") keep each lookup inside this item.
            title = content.xpath(".//a[@class='products_name']/@title").get()
            price = content.xpath(".//a[@class='price']/text()").get()
            yield {
                'title': title,
                'price': price
            }
Scraping Multiple URLs:


import scrapy


class BanggoodSpider(scrapy.Spider):
    """Spider that collects title/price pairs from several Banggood deal pages."""

    name = 'banggood'
    allowed_domains = ['sea.banggood.com']

    def start_requests(self):
        """Issue one request per deal page, each handled by ``parse``."""
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
        deal_pages = (
            'https://sea.banggood.com/Flashdeals-Featured.html',
            'https://sea.banggood.com/Deals_Health_Indoor.html#dealscategories2',
            'https://sea.banggood.com/Deals_Toys_Hobbies.html#dealscategories2',
        )
        for deal_page in deal_pages:
            yield scrapy.Request(
                url=deal_page,
                callback=self.parse,
                headers={'User-Agent': user_agent},
            )

    def parse(self, response):
        """Yield a ``{'title', 'price'}`` dict for each ``li.product-item``."""
        for product in response.xpath("//li[@class='product-item']"):
            # Both lookups are relative to the current product element.
            name_attr = product.xpath(".//a[@class='products_name']/@title")
            price_text = product.xpath(".//a[@class='price']/text()")
            yield {
                'title': name_attr.get(),
                'price': price_text.get()
            }

Run Spider

scrapy crawl banggood -o result.json

Result

You can see the results in the result.json file.

Scrape Pagination

import scrapy


class BanggoodSpider(scrapy.Spider):
    """Scrape every page of the Banggood Bluetooth-speakers category listing."""

    name = 'banggood'
    allowed_domains = ['sea.banggood.com']

    def start_requests(self):
        """Request the first listing page; ``collect_url`` fans out from it.

        The URL and headers are also stored in ``meta`` so the callback can
        reuse the same headers for the follow-up page requests.
        """
        main_url = 'https://sea.banggood.com/Wholesale-Bluetooth-Speakers-ca-1048.html'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
        }
        yield scrapy.Request(url=main_url, callback=self.collect_url, headers=headers, meta={"main_url": main_url, "headers": headers})

    def collect_url(self, response):
        """Parse the first page, then schedule requests for pages 2..last.

        The last page number is the largest numeric label among the
        pagination links (``div.num > a``). When no numeric label is found
        the listing is treated as a single page.
        """
        # The first page is the response already in hand. Requesting
        # main_url again would be discarded by Scrapy's duplicate filter
        # (same URL, no dont_filter), silently losing page 1's items.
        yield from self.parse(response)

        labels = response.xpath("//div[@class='num']/a/text()").getall()
        # Ignore non-numeric link texts instead of crashing on int().
        page_numbers = [int(label) for label in labels if label.strip().isdecimal()]
        last_page = max(page_numbers, default=1)

        base = 'https://sea.banggood.com/Wholesale-Bluetooth-Speakers-ca-1048-0-1-1-60-0_page'
        html = '.html'
        headers = response.meta['headers']

        for page in range(2, last_page + 1):
            yield scrapy.Request(url=base + str(page) + html, callback=self.parse, headers=headers)

    def parse(self, response):
        """Yield one ``{'title', 'price'}`` dict per ``div.p-wrap`` product.

        Non-breaking spaces in the title are replaced with plain spaces.
        Either value is ``None`` when the corresponding node is missing.
        """
        contents = response.xpath("//div[@class='p-wrap']")
        for content in contents:
            title = content.xpath(".//a[@class='title']/@title").get()
            # Explicit escape so the replacement is not a no-op, and guard
            # against a missing title (``.get()`` returns None).
            new_title = title.replace("\xa0", " ") if title is not None else None
            price = content.xpath(".//span[@class='price notranslate']/@oriprice").get()
            yield {
                'title': new_title,
                'price': price
            }

Leave a Reply

Your email address will not be published. Required fields are marked *