python data

Faker

Sample data can be generated with the Faker package.

basics

# Install first: pip install Faker
from faker import Faker
# One generator instance; each call below produces a fresh fake value.
fake = Faker()
# Fake full name
fake.name()
# Fake job title
fake.job()
# Fake postal address
fake.address()
# Fake birth date for a person who is at least 30 years old
fake.date_of_birth(minimum_age=30)
# Fake city name
fake.city()

specifying location

By passing a locale code (or a list of locale codes) to the Faker constructor, locale-specific information can be generated.

# Generator backed by several locales (Japanese, Chinese, Spanish, US English, French)
fake = Faker(['ja_JP','zh_CN','es_ES','en_US','fr_FR'])
# Print ten city names produced by the multi-locale generator
for _ in range(10):
    print(fake.city())

Create Text

# Random block of text from the current generator
fake.text()
# Switch to a Japanese generator and generate text with it; previously this
# instance was created and then overwritten without ever being used.
fake = Faker('ja_JP')
fake.text()

#### Create Text from Selected Words

# Back to a default-locale generator
fake = Faker()
# Custom vocabulary the sentence should be built from
my_information = [
'dog','swimming', '21', 'slow', 'girl', 'coffee', 'flower','pink']
# ext_word_list makes sentence() pick its words from the list above
fake.sentence(ext_word_list=my_information)

Create profile

# A single fake profile
fake.profile()
# Build many profiles and load them into a DataFrame
import pandas as pd
fake = Faker(['it_IT','ja_JP', 'zh_CN', 'de_DE','en_US'])
# The loop index is unused, so name it `_` (same convention as the city loop)
profiles = [fake.profile() for _ in range(100)]
# Show the first rows; each profile becomes one row
pd.DataFrame(profiles).head()

Create random datatype

# List of random Python values; variable_nb_elements=True lets the length
# vary around nb_elements instead of being exactly 5
fake.pylist(nb_elements=5, variable_nb_elements=True)

### Create decimal
# Random Decimal with 5 integer digits and 6 fractional digits;
# positive=False means negative values are allowed
fake.pydecimal(left_digits=5, right_digits=6, positive=False, min_value=None, max_value=None)

schema

# Install first: pip install schema

### validate data type

from schema import Schema

# Illustrative records to validate; the values are made up for this example.
data = [
    {'name': 'Alice',
     'city': 'New York',
     'closeness (1-5)': 3,
     'extrovert': True,
     'favorite_temperature': 21.5},
    {'name': 'Bob',
     'city': 'Paris',
     'closeness (1-5)': 5,
     'extrovert': False,
     'favorite_temperature': 18.0},
]

# Validate all of the data: a list of dicts, each with exactly these typed keys
schema = Schema([{'name': str,
                'city': str,
                'closeness (1-5)': int,
                'extrovert': bool,
                'favorite_temperature': float}])
# validate() returns the validated data, or raises SchemaError describing the mismatch
schema.validate(data)
# is_valid() only reports True/False
schema.is_valid(data)

#### validate with function

# A callable can be used as a value check: the value passes when the callable returns a truthy result
schema = Schema([{'name': str,
             'city': str, 
             'favorite_temperature': float,
             # check whether the value is in the range 1-5
              'closeness (1-5)': lambda n : 1 <= n <= 5,
              # catch-all: any other string key is accepted with any value
              str: object
             }])
# True when every record satisfies the schema, False otherwise
schema.is_valid(data)

Combine checks with the logical operators And / Or

# And / Or must be imported explicitly; without this the snippet raises NameError
from schema import And, Or, Schema

schema = Schema([{'name': str,
            # city must be a string AND consist of either two words or one word
            'city': And(str, Or(lambda n: len(n.split())==2, lambda n: len(n.split()) ==1)),
            'favorite_temperature': float,
            # value must be in the range 1-5 AND be a float
            'closeness (1-5)': And(lambda n : 1 <= n <= 5, float),
            # catch-all: any other string key is accepted with any value
            str: object
            }])

Optional

Mark a key as optional when it does not have to be present in every record; its value is only checked when the key exists.

# Optional must be imported explicitly; without this the snippet raises NameError
from schema import Optional, Schema

schema = Schema([{'name': str,
        'favorite_temperature': float,
        # 'detailed_info' may be absent; when present it must match this nested schema
        Optional('detailed_info'): {'favorite_color': str, 'phone number': str}
        }])

Forbidden

Make sure a certain key does not appear in the data.

# This example previously repeated the Optional schema and never used Forbidden.
from schema import Forbidden, Schema

schema = Schema([{'name': str,
        'favorite_temperature': float,
        # any record containing 'detailed_info' is rejected, whatever its value
        Forbidden('detailed_info'): object
        }])

convert data type

# Use must be imported explicitly; without this the snippet raises NameError
from schema import Schema, Use

# Use(int) runs int() on the input during validation, so '123' comes back as 123
Schema(Use(int)).validate('123')

others

sklearn.datasets

from sklearn.datasets import make_blobs
# Synthetic clustering data: 200 samples with 2 features drawn around 4 centers,
# each cluster with standard deviation 1.8.
# NOTE(review): make_blobs returns a (samples, labels) tuple, so raw_data is a tuple - confirm callers unpack it.
raw_data = make_blobs(n_samples = 200, n_features = 2, centers = 4, cluster_std = 1.8)