Sales Data Analysis and Insights
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
# Simulated data representing electromagnetic waves
# Columns: ['frequency', 'amplitude', 'signal_type']
np.random.seed(42) # For reproducibility
n_samples = 1000
# Generate synthetic data
frequencies = np.random.uniform(1e3, 1e9, n_samples) # Frequencies from 1 kHz to 1 GHz
amplitudes = np.random.uniform(0.1, 10, n_samples) # Amplitudes between 0.1 and 10
# Label by frequency band: below 1 MHz 'Radio', otherwise 'Microwave'.
# (With draws capped at 1 GHz, the 'Optical' branch never fires.)
signal_types = np.where(frequencies < 1e6, 'Radio',
                        np.where(frequencies < 1e9, 'Microwave', 'Optical'))
# Create a DataFrame
df = pd.DataFrame({
'frequency': frequencies,
'amplitude': amplitudes,
'signal_type': signal_types
})
# Features and target variable
X = df[['frequency', 'amplitude']]
y = df['signal_type']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Evaluate the model
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
# Plotting the data
plt.figure(figsize=(10, 6))
categories = pd.Categorical(df['signal_type']).categories
plt.scatter(df['frequency'], df['amplitude'],
            c=pd.Categorical(df['signal_type']).codes, cmap='viridis', alpha=0.5)
plt.xscale('log')  # frequencies span six orders of magnitude
plt.xlabel('Frequency (Hz)')
plt.ylabel('Amplitude')
plt.title('Electromagnetic Wave Signals Classification')
# Map tick codes back to category names (indexing the raw `signal_types`
# array here would label the ticks with arbitrary samples' types).
plt.colorbar(ticks=range(len(categories)), label='Signal Type',
             format=plt.FuncFormatter(lambda x, _: categories[int(x)]))
plt.show()
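As a quick sanity check on the model (a sketch using the fitted model and feature frame X from above): since the labels were constructed purely from frequency thresholds, the forest's feature importances should be dominated by frequency, with amplitude contributing almost nothing.

# Inspect which feature the forest actually relies on.
importances = pd.Series(model.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False))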
The JSON below describes a text-processing component: a TASK_CLEAN_DATA configuration plus the schema for the TASK_CHUNK_TEXT input.

{
"version": "v1beta",
"component": {
"text-0": {
"type": "text",
"input": {
"texts": [],
"setting": {
"clean-method": "Regex",
"exclude-patterns": [],
"include-patterns": [],
"clean-method": "Substring",
"exclude-substrings": [],
"include-substrings": [],
"case-sensitive": false
}
},
"condition": null,
"task": "TASK_CLEAN_DATA"
}
},
"$defs": {
"text": {
"description": "Text to be chunked",
"instillAcceptFormats": ["string"],
"instillUIMultiline": true,
"instillUIOrder": 0,
"instillUpstreamTypes": ["value", "reference", "template"],
"title": "Text",
"type": "string"
},
"chunk-size": {
"default": 512,
"description": "Specifies the maximum size of each chunk in terms of the number of tokens",
"instillAcceptFormats": ["integer"],
"instillUIOrder": 1,
"instillUpstreamTypes": ["value", "reference"],
"minimum": 1,
"title": "Chunk Size",
"type": "integer"
},
"chunk-overlap": {
"default": 100,
"description": "Determines the number of tokens that overlap between consecutive chunks",
"instillAcceptFormats": ["integer"],
"instillUIOrder": 2,
"instillUpstreamTypes": ["value", "reference"],
"minimum": 1,
"title": "Chunk Overlap",
"type": "integer"
},
"model-name": {
"description": "The name of the model used for tokenization.",
"enum": [
"gpt-4",
"gpt-3.5-turbo",
"text-davinci-003",
"text-davinci-002",
"text-davinci-001",
"text-curie-001",
"text-babbage-001",
"text-ada-001",
"davinci",
"curie",
"babbage",
"ada",
"code-davinci-002",
"code-davinci-001",
"code-cushman-002",
"code-cushman-001",
"davinci-codex",
"cushman-codex",
"text-davinci-edit-001",
"code-davinci-edit-001",
"text-embedding-ada-002",
"text-similarity-davinci-001",
"text-similarity-curie-001",
"text-similarity-babbage-001",
"text-similarity-ada-001",
"text-search-davinci-doc-001",
"text-search-curie-doc-001",
"text-search-babbage-doc-001",
"text-search-ada-doc-001",
"code-search-babbage-code-001",
"code-search-ada-code-001",
"gpt2"
],
"instillAcceptFormats": ["string"],
"instillUIOrder": 3,
"instillUpstreamTypes": ["value", "reference", "template"],
"title": "Model",
"type": "string"
}
},
"TASK_CHUNK_TEXT": {
"instillShortDescription": "Chunk text with different strategies",
"input": {
"description": "Input",
"instillEditOnNodeFields": ["text", "strategy"],
"instillUIOrder": 0,
"properties": {
"text": {
"$ref": "#/$defs/text"
},
"strategy": {
"description": "Chunking strategy",
"instillUIOrder": 1,
"properties": {
"setting": {
"description": "Chunk Setting",
"additionalProperties": true,
"type": "object",
"title": "Setting",
"instillUIOrder": 0,
"required": ["chunk-method"],
"oneOf": [
{
"properties": {
"chunk-method": {
"const": "Token",
"type": "string",
"title": "Chunk Method",
"description": "Chunking based on tokenization.",
"instillUIOrder": 0
},
"chunk-size": {
"$ref": "#/$defs/chunk-size"
},
"chunk-overlap": {
"$ref": "#/$defs/chunk-overlap"
},
"model-name": {
"$ref": "#/$defs/model-name"
},
"allowed-special": {
"default": [],
"description": "A list of special tokens that are allowed within chunks.",
"instillAcceptFormats": ["array:string"],
"items": {
"instillUIMultiline": false,
"type": "string"
},
"instillUIOrder": 4,
"instillUpstreamTypes": ["value", "reference", "template"],
"title": "Allowed Special Tokens",
"type": "array"
},
"disallowed-special": {
"default": [],
"description": "A list of special tokens that should not appear within chunks.",
"instillAcceptFormats": ["array:string"],
"items": {
"instillUIMultiline": false,
"type": "string"
},
"instillUIOrder": 5,
"instillUpstreamTypes": ["value", "reference", "template"],
"title": "Disallowed Special Tokens",
"type": "array"
}
},
"required": ["chunk-method"],
"instillEditOnNodeFields": [
"chunk-method",
"chunk-size",
"chunk-overlap",
"model-name",
"allowed-special",
"disallowed-special"
],
"title": "Token",
"type": "object",
"description": "Language models have a token limit. You should not exceed the token limit. When you split your text into chunks it is therefore a good idea to count the number of tokens. There are many tokenizers. When you count tokens in your text you should use the same tokenizer as used in the language model."
},
{
"properties": {
"chunk-method": {
"const": "Recursive",
"type": "string",
"title": "Chunk Method",
"description": "Chunking based on recursive splitting.",
"instillUIOrder": 0
},
"chunk-size": {
"$ref": "#/$defs/chunk-size"
},
"chunk-overlap": {
"$ref": "#/$defs/chunk-overlap"
},
"model-name": {
"$ref": "#/$defs/model-name"
},
"separators": {
"default": [],
"description": "A list of strings representing the separators used to split the text.",
"instillAcceptFormats": ["array:string"],
"instillUIOrder": 3,
"items": {
"instillUIMultiline": false,
"type": "string"
},
"instillUpstreamTypes": ["value", "reference", "template"],
"title": "Separators",
"type": "array"
},
"keep-separator": {
"description": "A flag indicating whether to keep the separator characters at the beginning or end of chunks",
"instillAcceptFormats": ["boolean"],
"instillUIOrder": 4,
"instillUpstreamTypes": ["value", "reference", "template"],
"title": "Keep Separator",
"type": "boolean"
}
},
"required": ["chunk-method"],
"instillEditOnNodeFields": [
"chunk-method",
"chunk-size",
"chunk-overlap",
"model-name",
"separators",
"keep-separator"
],
"title": "Recursive",
"type": "object",
"description": "This text splitter is the recommended one for generic text. It is parameterized by a list of characters. It tries to split on them in order until the chunks are small enough."
},
{
"properties": {
"chunk-method": {
"const": "Markdown",
"type": "string",
"title": "Chunk Method",
"description": "Chunking based on markdown text.",
"instillUIOrder": 0
},
"chunk-size": {
"$ref": "#/$defs/chunk-size"
},
"chunk-overlap": {
"$ref": "#/$defs/chunk-overlap"
},
"model-name": {
"$ref": "#/$defs/model-name"
}
},
"required": ["chunk-method"],
"instillEditOnNodeFields": [
"chunk-method",
"chunk-size",
"chunk-overlap",
"model-name"
],
"title": "Markdown",
"type": "object",
"description": "If your input is markdown, you can use this strategy to preserve the structure of your input."
}
]
}
},
"required": ["text", "strategy"],
"type": "object"
}
},
"title": "Input",
"type": "object"
}
}
}
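As an illustration of the Token strategy described in the schema above, here is a minimal sketch of a token-window chunker. It assumes the tiktoken package for tokenization; the chunk_size, chunk_overlap, and model_name parameters mirror the schema fields, and the function itself is illustrative, not the component's actual implementation.

import tiktoken

def chunk_text_by_tokens(text, chunk_size=512, chunk_overlap=100, model_name="gpt-3.5-turbo"):
    # Tokenize with the same tokenizer the target model uses.
    enc = tiktoken.encoding_for_model(model_name)
    tokens = enc.encode(text)
    step = chunk_size - chunk_overlap  # consecutive windows share `chunk_overlap` tokens
    chunks = []
    for start in range(0, len(tokens), step):
        window = tokens[start:start + chunk_size]
        chunks.append(enc.decode(window))
        if start + chunk_size >= len(tokens):
            break
    return chunks

With the defaults above, a 1,200-token document yields three overlapping chunks (tokens 0-512, 412-924, and 824-1,200).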
The clean_data function below mirrors the TASK_CLEAN_DATA setting shown earlier, filtering texts either by regex patterns or by substrings.

import re
def clean_data(texts, setting):
"""
Clean data based on the specified method and patterns/substrings.
Parameters:
texts (list of str): Array of text to be cleaned.
setting (dict): Dictionary containing the cleaning method and patterns/substrings.
Returns:
list of str: Cleaned array of text.
"""
cleaned_texts = []
method = setting.get('clean-method')
if method == 'Regex':
exclude_patterns = setting.get('exclude-patterns', [])
include_patterns = setting.get('include-patterns', [])
for text in texts:
exclude = any(re.search(pattern, text) for pattern in exclude_patterns) if exclude_patterns else False
include = any(re.search(pattern, text) for pattern in include_patterns) if include_patterns else True
if not exclude and include:
cleaned_texts.append(text)
    elif method == 'Substring':
        exclude_substrings = setting.get('exclude-substrings', [])
        include_substrings = setting.get('include-substrings', [])
        case_sensitive = setting.get('case-sensitive', False)
        if not case_sensitive:
            # Normalize once so each test below is a single `in` check.
            exclude_substrings = [s.lower() for s in exclude_substrings]
            include_substrings = [s.lower() for s in include_substrings]
        for text in texts:
            haystack = text if case_sensitive else text.lower()
            exclude = any(s in haystack for s in exclude_substrings) if exclude_substrings else False
            include = any(s in haystack for s in include_substrings) if include_substrings else True
            if not exclude and include:
                cleaned_texts.append(text)
return cleaned_texts
# Example usage
texts = [
"This is a sample text.",
"Another example text.",
"Text with special pattern: 12345",
"Text to be excluded: abcde"
]
# Using Regex method
setting_regex = {
'clean-method': 'Regex',
    'exclude-patterns': [r'\d{5}'],  # exclude texts containing five consecutive digits
    'include-patterns': [r'Text']   # include only texts matching 'Text' (regexes are case-sensitive)
}
cleaned_texts_regex = clean_data(texts, setting_regex)
# Using Substring method
setting_substring = {
'clean-method': 'Substring',
'exclude-substrings': ['abcde'], # Exclude texts containing 'abcde'
'include-substrings': ['sample'], # Include texts containing 'sample'
'case-sensitive': False
}
cleaned_texts_substring = clean_data(texts, setting_substring)
print("Cleaned Texts (Regex):", cleaned_texts_regex)
print("Cleaned Texts (Substring):", cleaned_texts_substring)from collections import deque
def process_numbers(nums):
    # Evens are enqueued at the back; each odd drops the oldest element.
    queue = deque([10, 20, 30])
    for num in nums:
        if num % 2 == 0:
            queue.append(num)
        elif queue:  # num is odd: discard from the front, if anything is left
            queue.popleft()
    return list(queue)
nums = [2, 3, 4, 5, 6, 7]
print(process_numbers(nums))  # -> [2, 4, 6]

def process_strings(chars):
stack = ["start"]
for char in chars:
if char.isupper():
stack.append(char)
elif stack and char.islower():
stack.pop()
return stack
chars = ['A', 'b', 'c', 'D', 'E', 'f']
print(process_strings(chars))  # -> ['D']

def mystery_function(nums):
    # Two pointers scanning right-to-left: each non-zero element is swapped
    # toward `left`, compacting the non-zeros at the end of the list and
    # pushing the zeros to the front while preserving the non-zeros' order.
    left = len(nums) - 1
    right = len(nums) - 1
    while right >= 0:
        if nums[right] != 0:
            nums[right], nums[left] = nums[left], nums[right]
            left -= 1
        right -= 1
    return nums
nums = [0, 0, 1, 2, 0, 3]
print(mystery_function(nums))  # -> [0, 0, 0, 1, 2, 3]

import pandas as pd
# Load the dataset
file_path = './Sales Data.csv'
sales_data = pd.read_csv(file_path)
# Display the first few rows of the dataset
sales_data.head()

import plotly.express as px
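The analysis below assumes the CSV exposes Month, Product, City, Hour, and Sales columns; if your file stores raw order data instead, derive these first. A minimal guard (a sketch, not part of the original notebook) makes that assumption explicit:

# Hypothetical guard: fail fast if an expected column is missing.
expected_columns = {'Month', 'Product', 'City', 'Hour', 'Sales'}
missing = expected_columns - set(sales_data.columns)
if missing:
    raise KeyError(f"'Sales Data.csv' is missing expected columns: {sorted(missing)}")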
# Aggregate total sales for each month
monthly_sales = sales_data.groupby('Month')['Sales'].sum().reset_index()
# Plot the total sales for each month
fig = px.line(monthly_sales, x='Month', y='Sales', title='Total Sales per Month', labels={'Sales': 'Total Sales', 'Month': 'Month'})
fig.show()

# Aggregate total sales for each product
product_sales = sales_data.groupby('Product')['Sales'].sum().reset_index().sort_values(by='Sales', ascending=False)
# Display the top-selling products
product_sales.head(10)

# Aggregate total sales for each city
city_sales = sales_data.groupby('City')['Sales'].sum().reset_index().sort_values(by='Sales', ascending=False)
# Plot the total sales for each city
fig = px.bar(city_sales, x='City', y='Sales', title='Total Sales by City', labels={'Sales': 'Total Sales', 'City': 'City'})
fig.show()

# Aggregate total sales for each hour
hourly_sales = sales_data.groupby('Hour')['Sales'].sum().reset_index().sort_values(by='Hour')
# Plot the total sales for each hour
fig = px.line(hourly_sales, x='Hour', y='Sales', title='Total Sales by Hour', labels={'Sales': 'Total Sales', 'Hour': 'Hour of the Day'})
fig.show()

# Use the first word of each product name as a rough category proxy
sales_data['Category'] = sales_data['Product'].apply(lambda x: x.split()[0])
# Aggregate total sales for each category
category_sales = sales_data.groupby('Category')['Sales'].sum().reset_index().sort_values(by='Sales', ascending=False)
# Plot the total sales for each category
fig = px.bar(category_sales, x='Category', y='Sales', title='Total Sales by Product Category', labels={'Sales': 'Total Sales', 'Category': 'Product Category'})
fig.show()
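As a possible next step, the monthly aggregate computed earlier lends itself to a month-over-month growth view. A brief sketch (assuming the monthly_sales frame from above; the growth column name is ours):

# Month-over-month growth in percent; the first month has no prior month, so NaN.
monthly_sales['MoM Growth %'] = monthly_sales['Sales'].pct_change() * 100
print(monthly_sales)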