[Tuto] Web scraping d'un Prestashop 1.6 avec Scrapy

Baba deCode · May 16, 2017

Objectif:
Permettre de récupérer les produits d'un site Prestashop en format CSV en vue d'une importation future

Disclaimer:

https://fr.wikipedia.org/wiki/Web_scraping

OS utilisé: Ubuntu 17.04
Version de scrapy 1.3.3

Version de prestashop 1.6.13

Theme: default

Installation de Scrapy (framework de web crawling en Python)

dans un terminal:

sudo apt install python-pip

sudo pip install Scrapy

1) Création du projet:

scrapy startproject prestashop16

2) On édite items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class Prestashop16Item(Item):
    """Container for the data collected from one product page."""

    url = Field()                      # address of the product page
    balise_title = Field()             # content of the <title> tag
    balise_meta_description = Field()  # content of the meta description tag
    h1 = Field()                       # main heading of the page
    reference = Field()                # product reference (SKU)
    quantity = Field()                 # quantity shown as available
    description_courte = Field()       # short description
    description_longue = Field()       # long description
    prix_ttc = Field()                 # price including tax
    images = Field()                   # links to the thumbnail-list images
    main_image = Field()               # source of the main product image

3) On édite settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for prestashop16 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'prestashop16'

SPIDER_MODULES = ['prestashop16.spiders']
NEWSPIDER_MODULE = 'prestashop16.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Alternative: announce the crawler as Googlebot instead
# USER_AGENT = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
USER_AGENT = 'prestashop16 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs

# Crawl one page per second
DOWNLOAD_DELAY = 1

# Scraped items are written to this file.
# NOTE(review): directory names are case-sensitive on Linux -- confirm that
# this path matches the real desktop folder of the user running the crawl.
FEED_URI = '/home/nom_utilisateur/desktop/liste_produits_prestashop.csv'
# Serialization format of the feed
FEED_FORMAT = 'csv'
# Project-level exporters go in FEED_EXPORTERS. FEED_EXPORTERS_BASE is Scrapy's
# own built-in table; redefining it drops every other built-in exporter.
FEED_EXPORTERS = {
    'csv': 'scrapy.exporters.CsvItemExporter',
}

# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'prestashop16.middlewares.Prestashop16SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'prestashop16.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'prestashop16.pipelines.Prestashop16Pipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

4) On crée dans le dossier spiders -> le fichier presta_bot.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider
from scrapy.linkextractors import LinkExtractor
from prestashop16.items import Prestashop16Item
from scrapy.selector import Selector
from scrapy.http import Request


class prestashop16(CrawlSpider):
    """Request a fixed range of product pages and emit one item per page.

    Product pages are addressed directly by id, from ``start_id_product``
    to ``end_id_product`` (both included); no links are followed.
    """

    name = "presta_bot"
    # Only this domain may be crawled.
    allowed_domains = ['demo-prestashop-16.terracode.de']
    # Id of the first product to fetch.
    start_id_product = 1
    # Id of the last product to fetch (inclusive).
    end_id_product = 5

    def start_requests(self):
        """Yield one request per product id in the configured range."""
        # range() excludes its upper bound, hence the + 1 to reach the last id.
        for i in range(self.start_id_product, self.end_id_product + 1):
            yield Request('https://demo-prestashop-16.terracode.de/index.php?controller=product&id_product=%d' % i,
                    callback=self.parse_items)

    def parse_items(self, response):
        """Extract the product data from a product page.

        Every field except ``url`` holds the list of strings matched by its
        XPath query; the list is empty when nothing matches.
        """
        sel = Selector(response)

        item = Prestashop16Item()

        item['url'] = response.url
        item['balise_title'] = sel.xpath('//title/text()').extract()
        item['balise_meta_description'] = sel.xpath('/html/head/meta[@name="description"]/@content').extract()
        item['h1'] = sel.xpath('//h1/text()').extract()
        item['reference'] = sel.xpath('//span[contains(@itemprop, "sku")]/@content').extract()
        item['quantity'] = sel.xpath('//span[@id="quantityAvailable"]/text()').extract()
        item['description_courte'] = sel.xpath('//div[@id="short_description_content"]//p/text()').extract()
        item['description_longue'] = sel.xpath('//section[@class="page-product-box"]//div[@class="rte"]//p/text()').extract()
        item['prix_ttc'] = sel.xpath('//span[contains(@itemprop, "price")]/@content').extract()
        item['images'] = sel.xpath('//ul[@id="thumbs_list_frame"]/li/a/@href').extract()
        item['main_image'] = sel.xpath('//div[@id="image-block"]//span[@id="view_full_size"]//img/@src').extract()

        # Hand the item over to the rest of the Scrapy processing chain.
        yield item

5) on lance le bot

alexandre@ordi-alexandre:~/prestashop16$  scrapy crawl presta_bot

6) on récupère le CSV sur le bureau

Edited May 16, 2017 by Alexandre Carette (see edit history)

Baba deCode · May 18, 2017

Objectif: Synchroniser les stocks produits (sans déclinaisons) d'un site prestashop A vers un site prestashop B

- On utilise ici l'id du produit comme clef, donc le site A et B doivent avoir les mêmes id_product

1) Installer scrapy sur votre serveur

2) On crée un nouveau projet dans /home/mes_crawlers/

scrapy startproject ps16stock

2) On édite items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class Ps16StockItem(scrapy.Item):
    """Stock level of one product, keyed by its product id."""

    # Only the ``scrapy`` module is imported in this file, so Field has to be
    # referenced through it; a bare ``Field()`` raises NameError on import.
    quantity = scrapy.Field()
    id_product = scrapy.Field()

3) On édite settings.py

on active ITEM_PIPELINES

# -*- coding: utf-8 -*-

# Scrapy settings for ps16stock project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = "ps16stock"

# Spiders are looked up in, and newly generated into, the same package.
NEWSPIDER_MODULE = "ps16stock.spiders"
SPIDER_MODULES = [NEWSPIDER_MODULE]


# Identify the crawler (and the site of its owner) through the user-agent
USER_AGENT = "ps16stock (+http://www.yourdomain.com)"

# Follow the rules published in robots.txt
ROBOTSTXT_OBEY = True

# Maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Delay between requests to the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'ps16stock.middlewares.Ps16StockSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'ps16stock.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Item pipelines enabled for this project, each mapped to its order value
# See http://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {"ps16stock.pipelines.Ps16StockPipeline": 300}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

4) On crée dans le dossier spiders -> le fichier stock_bot.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider
from scrapy.linkextractors import LinkExtractor
from ps16stock.items import Ps16StockItem
from scrapy.selector import Selector
from scrapy.http import Request


class Ps16Stock(CrawlSpider):
    """Read the available quantity of a fixed range of products.

    Product pages are requested directly by id, from ``start_id_product``
    to ``end_id_product`` (both included); no links are followed.
    """

    name = "stock_bot"
    # Only this domain may be crawled.
    allowed_domains = ['demo-prestashop-16.terracode.de']
    # Id of the first product to fetch.
    start_id_product = 1
    # Id of the last product to fetch (inclusive).
    end_id_product = 5

    def start_requests(self):
        """Yield one request per product id in the configured range."""
        # range() excludes its upper bound, hence the + 1 to reach the last id.
        for i in range(self.start_id_product, self.end_id_product + 1):
            # %d is required here: a bare trailing '%' raises
            # "ValueError: incomplete format" before any request is sent.
            yield Request('https://demo-prestashop-16.terracode.de/index.php?controller=product&id_product=%d' % i,
                    callback=self.parse_items)

    def parse_items(self, response):
        """Build the stock item for one product page.

        Returns the filled item, or None when the page does not contain both
        the hidden ``id_product`` input and the ``quantityAvailable`` span.
        """
        sel = Selector(response)

        # extract_first() returns None instead of raising IndexError when
        # the node is missing from the page.
        id_product = sel.xpath('//input[@type="hidden"][@name="id_product"]/@value').extract_first()
        quantity = sel.xpath('//span[@id="quantityAvailable"]/text()').extract_first()

        if id_product is None or quantity is None:
            self.logger.warning("No stock data found on %s, page skipped", response.url)
            return None

        item = Ps16StockItem()
        item['id_product'] = id_product
        # Drop the whitespace surrounding the number in the page markup.
        item['quantity'] = quantity.strip()

        return item

5) on édite pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

 
import MySQLdb


class Ps16StockPipeline(object):
    """Write each scraped stock quantity into the ``ps_stock_available`` table."""

    def __init__(self):
        """Open the MySQL connection and cursor used for every item."""
        print("Opening connection mysql...")
        self.conn = MySQLdb.connect(
            # MySQL user
            user='utilisateur_mysql',
            # MySQL password
            passwd='password',
            # database name
            db='scrapy1',
            # address of the MySQL server
            host='localhost',
            charset='utf8',
            use_unicode=True
        )
        self.cursor = self.conn.cursor()
        print("Opening ok")

    def process_item(self, item, spider):
        """Update the quantity of the product described by ``item``.

        A MySQL error is printed and does not stop the crawl. The item is
        returned in every case, so that any later pipeline still receives it.
        """
        try:
            self.cursor.execute("""UPDATE ps_stock_available SET quantity=%s WHERE id_product=%s""", (item['quantity'], item['id_product']))
            self.conn.commit()
        except MySQLdb.Error as e:
            print("Error %d: %s" % (e.args[0], e.args[1]))

        # Must sit outside the except branch: returning None on success
        # would hand None to the next pipeline instead of the item.
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()

6) On crée un fichier bash ps16stock.sh

#!/bin/bash
# Launch the stock_bot spider; meant to be started by cron.

# Make the scrapy executable installed in /usr/local/bin reachable.
PATH=$PATH:/usr/local/bin
export PATH

# "scrapy crawl" must run from inside the project directory (the one holding
# scrapy.cfg), which "scrapy startproject ps16stock" created under
# /home/mes_crawlers/. Stop if the directory is missing.
cd /home/mes_crawlers/ps16stock || exit 1
scrapy crawl stock_bot

7) on met en place une tâche cron tous les jours à 5 heures du matin

crontab -e

# Run once a day at 05:00. A "*" in the minute field would start the job
# every minute from 05:00 to 05:59.
0 5 * * * sh /myfolder/crawlers/ps16stock.sh

Edited May 18, 2017 by Alexandre Carette (see edit history)

dandumit · June 30, 2019

On 5/17/2017 at 12:45 AM, Alexandre Carette said:

6) on recupere le csv sur le bureau

Merci beaucoup ! (that's all my French)

Please tell me , is there any way to get the translated version of site ? like translation from Chrome ?

Thank you,

Daniel

Aboumalak · February 11, 2020

Bonjour Alexandre,

Merci pour ce tuto.

J'ai suivi les étapes à la lettre, mais je reçois malheureusement l'erreur suivante :(

scrapy crawl presta_bot
Traceback (most recent call last):
File "/usr/local/bin/scrapy", line 11, in <module>
    sys.exit(execute())
File "/usr/local/lib/python2.7/dist-packages/scrapy/cmdline.py", line 145, in execute
    cmd.crawler_process = CrawlerProcess(settings)
File "/usr/local/lib/python2.7/dist-packages/scrapy/crawler.py", line 267, in __init__
    super(CrawlerProcess, self).__init__(settings)
File "/usr/local/lib/python2.7/dist-packages/scrapy/crawler.py", line 145, in __init__
    self.spider_loader = _get_spider_loader(settings)
File "/usr/local/lib/python2.7/dist-packages/scrapy/crawler.py", line 347, in _get_spider_loader
    return loader_cls.from_settings(settings.frozencopy())
File "/usr/local/lib/python2.7/dist-packages/scrapy/spiderloader.py", line 61, in from_settings
    return cls(settings)
File "/usr/local/lib/python2.7/dist-packages/scrapy/spiderloader.py", line 25, in __init__
    self._load_all_spiders()
File "/usr/local/lib/python2.7/dist-packages/scrapy/spiderloader.py", line 47, in _load_all_spiders
    for module in walk_modules(name):
File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/misc.py", line 73, in walk_modules
    submod = import_module(fullpath)
File "/usr/lib/python2.7/importlib/__init__.py", line 37, in import_module
    __import__(name)
File "/home/omar/prestashop16/prestashop16/spiders/presta_bot.py", line 5, in <module>
    from prestashop16.items import Prestashop16Item
File "/home/omar/prestashop16/prestashop16/items.py", line 11, in <module>
    class Prestashop16Item(scrapy.Item):
NameError: name 'scrapy' is not defined

Sign In

[Tuto] Web scraping d'un Prestashop 1.6 avec Scrapy

Recommended Posts

Baba deCode

Link to comment

Share on other sites

Baba deCode

Link to comment

Share on other sites

dandumit

Link to comment

Share on other sites

Aboumalak

Link to comment

Share on other sites

Create an account or sign in to comment

Create an account

Sign in

Browse

Activity

Go back to prestashop.com