In [1]:
import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()
Out[1]:
'C:\\spark'
In [2]:
import math
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import lit
from pyspark.sql.types import StringType
from pyspark.sql import SQLContext
In [3]:
spark = SparkSession.builder.appName('SparkSQL_Python_Profeco').getOrCreate()
In [4]:
data_path = 'data/OPI/'
In [5]:
file_path = data_path + 'all_data.csv'
df = spark.read.format('csv').option("header", "true").load(file_path)

DATA EXPLORATION

In [6]:
df.show(10)
+--------------------+--------------------+------------------+----------------+----------------+------+--------------------+------------------+--------------------+--------------------+--------------------+----------------+--------------------+--------+----------+
|            producto|        presentacion|             marca|       categoria|        catalogo|precio|       fechaRegistro|   cadenaComercial|                giro|     nombreComercial|           direccion|          estado|           municipio| latitud|  longitud|
+--------------------+--------------------+------------------+----------------+----------------+------+--------------------+------------------+--------------------+--------------------+--------------------+----------------+--------------------+--------+----------+
|CUADERNO FORMA IT...|96 HOJAS PASTA DU...|          ESTRELLA|MATERIAL ESCOLAR|UTILES ESCOLARES|  25.9|2011-05-18 00:00:...|ABASTECEDORA LUMEN|          PAPELERIAS|ABASTECEDORA LUME...|CANNES No. 6 ESQ....|DISTRITO FEDERAL|TLALPAN          ...|19.29699|-99.125417|
|            CRAYONES|CAJA 12 CERAS. JU...|           CRAYOLA|MATERIAL ESCOLAR|UTILES ESCOLARES|  27.5|2011-05-18 00:00:...|ABASTECEDORA LUMEN|          PAPELERIAS|ABASTECEDORA LUME...|CANNES No. 6 ESQ....|DISTRITO FEDERAL|TLALPAN          ...|19.29699|-99.125417|
|            CRAYONES|CAJA 12 CERAS. TA...|           CRAYOLA|MATERIAL ESCOLAR|UTILES ESCOLARES|  13.9|2011-05-18 00:00:...|ABASTECEDORA LUMEN|          PAPELERIAS|ABASTECEDORA LUME...|CANNES No. 6 ESQ....|DISTRITO FEDERAL|TLALPAN          ...|19.29699|-99.125417|
|   COLORES DE MADERA|CAJA 12 PIEZAS LA...|          PINCELIN|MATERIAL ESCOLAR|UTILES ESCOLARES|  46.9|2011-05-18 00:00:...|ABASTECEDORA LUMEN|          PAPELERIAS|ABASTECEDORA LUME...|CANNES No. 6 ESQ....|DISTRITO FEDERAL|TLALPAN          ...|19.29699|-99.125417|
|         COLOR LARGO|CAJA 36 PIEZAS. C...|           CRAYOLA|MATERIAL ESCOLAR|UTILES ESCOLARES|   115|2011-05-18 00:00:...|ABASTECEDORA LUMEN|          PAPELERIAS|ABASTECEDORA LUME...|CANNES No. 6 ESQ....|DISTRITO FEDERAL|TLALPAN          ...|19.29699|-99.125417|
|           BOLIGRAFO|BLISTER 3 PIEZAS....|  BIC. CRISTAL GEL|MATERIAL ESCOLAR|UTILES ESCOLARES|  32.5|2011-05-18 00:00:...|ABASTECEDORA LUMEN|          PAPELERIAS|ABASTECEDORA LUME...|CANNES No. 6 ESQ....|DISTRITO FEDERAL|TLALPAN          ...|19.29699|-99.125417|
|      CINTA ADHESIVA|BOLSA 1 PIEZA. 12...|    SCOTCH 3M. 600|MATERIAL ESCOLAR|UTILES ESCOLARES|     9|2011-05-18 00:00:...|ABASTECEDORA LUMEN|          PAPELERIAS|ABASTECEDORA LUME...|CANNES No. 6 ESQ....|DISTRITO FEDERAL|TLALPAN          ...|19.29699|-99.125417|
|   COLORES DE MADERA|CAJA 24 PIEZAS. L...|    KORES. KOLORES|MATERIAL ESCOLAR|UTILES ESCOLARES|  95.9|2011-05-18 00:00:...|ABASTECEDORA LUMEN|          PAPELERIAS|ABASTECEDORA LUME...|CANNES No. 6 ESQ....|DISTRITO FEDERAL|TLALPAN          ...|19.29699|-99.125417|
|            CRAYONES|CAJA 24 CERAS. TA...|PAPER MATE. CARMEN|MATERIAL ESCOLAR|UTILES ESCOLARES|  23.2|2011-05-18 00:00:...|ABASTECEDORA LUMEN|          PAPELERIAS|ABASTECEDORA LUME...|CANNES No. 6 ESQ....|DISTRITO FEDERAL|TLALPAN          ...|19.29699|-99.125417|
|  PAN BLANCO BOLILLO|               PIEZA|               S/M|             PAN|         BASICOS|   1.2|2011-01-10 00:00:...|COMERCIAL MEXICANA|TIENDA DE AUTOSER...|COMERCIAL MEXICAN...|AV. LAGO DE GUADA...|          MÉXICO|ATIZAPAN         ...|      NA|        NA|
+--------------------+--------------------+------------------+----------------+----------------+------+--------------------+------------------+--------------------+--------------------+--------------------+----------------+--------------------+--------+----------+
only showing top 10 rows

In [7]:
df.columns
Out[7]:
['producto',
 'presentacion',
 'marca',
 'categoria',
 'catalogo',
 'precio',
 'fechaRegistro',
 'cadenaComercial',
 'giro',
 'nombreComercial',
 'direccion',
 'estado',
 'municipio',
 'latitud',
 'longitud']
In [8]:
df.count()
Out[8]:
62530715

CHECKING AND REMOVING DUPLICATES

In [9]:
df = df.drop_duplicates()
In [10]:
df.count()
Out[10]:
60753811

CHECKING AND REMOVING MISSING VALUES

In [12]:
df = df.na.drop()

How many records are there?

In [13]:
df.count()
Out[13]:
59919904
Removing the label "Estado" as a value in column estado
In [14]:
df = df.filter(df['estado'] !='estado')
In [15]:
df.count()
Out[15]:
59919903

There is records 59919903 Valid records

How many categories are there?

In [16]:
df.createOrReplaceTempView('profeco')
In [17]:
spark.sql("Select distinct categoria " 
                    "from profeco").count()
Out[17]:
40
There is 40 categories

How many trade chains are being monitored (and therefore reported in that database)?

In [18]:
spark.sql("Select distinct cadenaComercial " 
                    "from profeco").count()
Out[18]:
704
There is 705 trade chains monitored and reported in the database

What are the most monitored products in each state of the country?

In [19]:
df_sql = spark.sql("select "
    "z.estado, "
    "z.producto, "
    "z.cantidad "
"from ( "
 " select "
 " estado, "
 " max(cantidad) as top_cantidad "
 "  from "
 "  ( "
 "  select "
 "  estado, "
                    "producto, "
                    "count(*) as cantidad "
                "from "
                    "profeco "
                "group by "
                    "estado, "
                    "producto " 
            ") x "
        "group by "
            " estado "
    ") y "
    "join "
        "( "
            "select " 
                "estado, "
                "producto, "
                "count(*) as cantidad "
            "from "
                "profeco "
            "group by "
                "estado, "
                "producto "
        ") z "
    "on "  
        "y.estado = z.estado and "
        "y.top_cantidad = z.cantidad")
In [20]:
df_sql.show(100)
+--------------------+--------------------+--------+
|              estado|            producto|cantidad|
+--------------------+--------------------+--------+
| BAJA CALIFORNIA SUR|            REFRESCO|   26290|
|            GUERRERO|            REFRESCO|    8328|
|              OAXACA|LECHE ULTRAPASTEU...|   17575|
|              MÉXICO|            REFRESCO|  187179|
|             CHIAPAS|            REFRESCO|   13776|
|           QUERÉTARO|LECHE ULTRAPASTEU...|   27050|
|             MORELOS|            REFRESCO|    8939|
|             HIDALGO|            REFRESCO|   17473|
|VERACRUZ DE IGNAC...|            REFRESCO|   13674|
| COL. EDUARDO GUERRA|            REFRESCO|     237|
|             JALISCO|            REFRESCO|   79678|
|      AGUASCALIENTES|                 FUD|   11218|
|           CHIHUAHUA|   DETERGENTE P/ROPA|   14463|
|             NAYARIT|            REFRESCO|    7409|
|          TAMAULIPAS|            REFRESCO|   20867|
|             TABASCO|            REFRESCO|   27698|
|        QUINTANA ROO|                 FUD|   32733|
|            TLAXCALA|            REFRESCO|   42137|
|              PUEBLA|            REFRESCO|   30431|
|       ESQ. SUR 125"|  PAN BLANCO BOLILLO|     128|
|     SAN LUIS POTOSÍ|                 FUD|    9317|
|    DISTRITO FEDERAL|            REFRESCO|  276523|
|              COLIMA|   DETERGENTE P/ROPA|   16226|
| MICHOACÁN DE OCAMPO|   DETERGENTE P/ROPA|   38665|
|          NUEVO LEÓN|   DETERGENTE P/ROPA|   48028|
|           ZACATECAS|   DETERGENTE P/ROPA|   19739|
|             DURANGO|           LAVADORAS|    9213|
|             YUCATÁN|LECHE ULTRAPASTEU...|   35180|
|              SONORA|            REFRESCO|   32849|
|            CAMPECHE|                 FUD|   12011|
|          GUANAJUATO|            REFRESCO|   47779|
|COAHUILA DE ZARAGOZA|                 FUD|   26724|
|     BAJA CALIFORNIA|            REFRESCO|   35390|
|             SINALOA|            REFRESCO|   31369|
+--------------------+--------------------+--------+

What is the trade chain with the greatest variety of monitored products?

In [22]:
df_sql = spark.sql("select cadenaComercial, count(DISTINCT producto) as variedad "
           " from profeco "           
           "group by cadenaComercial "
           "order by variedad desc "
           "LIMIT 1 ")
df_sql.show()
+---------------+--------+
|cadenaComercial|variedad|
+---------------+--------+
|        SORIANA|    1046|
+---------------+--------+