"""
=======================================
Visualizing the stock market structure
=======================================

This example employs several unsupervised learning techniques to extract
the stock market structure from variations in historical quotes.

The quantity that we use is the daily variation in quote price: quotes
that are linked tend to fluctuate in relation to each other during a day.
"""

# Author: Gael Varoquaux gael.varoquaux@normalesup.org
# License: BSD 3 clause

# %%
# Retrieve the data from the Internet
# -----------------------------------
#
# The data is from 2003 - 2008. This period is reasonably calm: recent enough
# to include high-tech firms, yet before the 2008 crash. This kind of
# historical data can be obtained from APIs like
# `data.nasdaq.com <https://data.nasdaq.com/>`_ and
# `alphavantage.co <https://www.alphavantage.co/>`_.

import sys

import numpy as np
import pandas as pd

symbol_dict = {
    "TOT": "Total",
    "XOM": "Exxon",
    "CVX": "Chevron",
    "COP": "ConocoPhillips",
    "VLO": "Valero Energy",
    "MSFT": "Microsoft",
    "IBM": "IBM",
    "TWX": "Time Warner",
    "CMCSA": "Comcast",
    "CVC": "Cablevision",
    "YHOO": "Yahoo",
    "DELL": "Dell",
    "HPQ": "HP",
    "AMZN": "Amazon",
    "TM": "Toyota",
    "CAJ": "Canon",
    "SNE": "Sony",
    "F": "Ford",
    "HMC": "Honda",
    "NAV": "Navistar",
    "NOC": "Northrop Grumman",
    "BA": "Boeing",
    "KO": "Coca Cola",
    "MMM": "3M",
    "MCD": "McDonald's",
    "PEP": "Pepsi",
    "K": "Kellogg",
    "UN": "Unilever",
    "MAR": "Marriott",
    "PG": "Procter & Gamble",
    "CL": "Colgate-Palmolive",
    "GE": "General Electric",
    "WFC": "Wells Fargo",
    "JPM": "JPMorgan Chase",
    "AIG": "AIG",
    "AXP": "American Express",
    "BAC": "Bank of America",
    "GS": "Goldman Sachs",
    "AAPL": "Apple",
    "SAP": "SAP",
    "CSCO": "Cisco",
    "TXN": "Texas Instruments",
    "XRX": "Xerox",
    "WMT": "Wal-Mart",
    "HD": "Home Depot",
    "GSK": "GlaxoSmithKline",
    "PFE": "Pfizer",
    "SNY": "Sanofi-Aventis",
    "NVS": "Novartis",
    "KMB": "Kimberly-Clark",
    "R": "Ryder",
    "GD": "General Dynamics",
    "RTN": "Raytheon",
    "CVS": "CVS",
    "CAT": "Caterpillar",
    "DD": "DuPont de Nemours",
}


symbols, names = np.array(sorted(symbol_dict.items())).T

quotes = []

for symbol in symbols:
    print("Fetching quote history for %r" % symbol, file=sys.stderr)
    url = (
        "https://raw.githubusercontent.com/scikit-learn/examples-data/"
        "master/financial-data/{}.csv"
    )
    quotes.append(pd.read_csv(url.format(symbol)))

close_prices = np.vstack([q["close"] for q in quotes])
open_prices = np.vstack([q["open"] for q in quotes])

# The daily variations of the quotes are what carry the most information
variation = close_prices - open_prices

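# %%
# As a quick sanity check (added for illustration; not part of the original
# example), the variation matrix should have one row per stock and one
# column per trading day:
print(f"variation matrix shape (n_stocks, n_days): {variation.shape}")
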
# %%
# .. _stock_market:
#
# Learning a graph structure
# --------------------------
#
# We use sparse inverse covariance estimation to find which quotes are
# correlated conditionally on the others. Specifically, sparse inverse
# covariance gives us a graph, that is, a list of connections. For each
# symbol, the symbols that it is connected to are those useful in explaining
# its fluctuations.

from sklearn import covariance

alphas = np.logspace(-1.5, 1, num=10)
edge_model = covariance.GraphicalLassoCV(alphas=alphas)

# standardize the time series: using correlations rather than covariance is
# more efficient for structure recovery
X = variation.copy().T
X /= X.std(axis=0)
edge_model.fit(X)

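# %%
# As an added illustration (not part of the original example), we can inspect
# the regularization strength selected by cross-validation and count the
# edges of the resulting graph, i.e. the nonzero off-diagonal entries of the
# estimated precision matrix:
print(f"CV-selected alpha: {edge_model.alpha_:.3f}")
n_edges = int((np.abs(np.triu(edge_model.precision_, k=1)) > 1e-8).sum())
print(f"Number of graph edges (nonzero off-diagonal entries): {n_edges}")
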
# %%
# Clustering using affinity propagation
# -------------------------------------
#
# We use clustering to group together quotes that behave similarly. Here,
# amongst the :ref:`various clustering techniques <clustering>` available
# in scikit-learn, we use :ref:`affinity_propagation` as it does not
# enforce equal-size clusters, and it can choose the number of clusters
# automatically from the data.
#
# Note that this gives us a different indication than the graph, as the
# graph reflects conditional relations between variables, while the
# clustering reflects marginal properties: variables clustered together can
# be considered to have a similar impact at the level of the full stock
# market.

from sklearn import cluster

_, labels = cluster.affinity_propagation(edge_model.covariance_, random_state=0)
n_labels = labels.max()

for i in range(n_labels + 1):
    print(f"Cluster {i + 1}: {', '.join(names[labels == i])}")

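# %%
# As an added illustration (not part of the original example), affinity
# propagation chose the number of clusters itself; we can summarize the
# cluster sizes in one line:
print(f"Found {n_labels + 1} clusters with sizes: {np.bincount(labels)}")
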
# %%
# Embedding in 2D space
# ---------------------
#
# For visualization purposes, we need to lay out the different symbols on a
# 2D canvas. For this we use :ref:`manifold` techniques to retrieve a 2D
# embedding.
# We use a dense eigen_solver to achieve reproducibility (arpack is
# initialized with random vectors that we don't control). In addition, we
# use a large number of neighbors to capture the large-scale structure.

# Finding a low-dimension embedding for visualization: find the best position
# of the nodes (the stocks) on a 2D plane

from sklearn import manifold

node_position_model = manifold.LocallyLinearEmbedding(
    n_components=2, eigen_solver="dense", n_neighbors=6
)

embedding = node_position_model.fit_transform(X.T).T

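# %%
# Any manifold learner exposing the same ``fit_transform`` API could be
# swapped in for the node layout. A minimal sketch (added for illustration;
# not part of the original example) using metric MDS instead:
from sklearn.manifold import MDS

alt_embedding = MDS(n_components=2, random_state=0).fit_transform(X.T).T
print(f"MDS embedding shape: {alt_embedding.shape}")  # (2, n_stocks)
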
# %%
# Visualization
# -------------
#
# The outputs of the 3 models are combined in a 2D graph where nodes
# represent the stocks and edges the connections between them:
#
# - cluster labels are used to define the color of the nodes
# - the sparse covariance model is used to display the strength of the edges
# - the 2D embedding is used to position the nodes in the plane
#
# This example has a fair amount of visualization-related code, as
# visualization is crucial here to display the graph. One of the challenges
# is to position the labels while minimizing overlap. For this we use a
# heuristic based on the direction of the nearest neighbor along each
# axis.

import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

plt.figure(1, facecolor="w", figsize=(10, 8))
plt.clf()
ax = plt.axes([0.0, 0.0, 1.0, 1.0])
plt.axis("off")

# Plot the graph of partial correlations: normalize the precision matrix so
# that each off-diagonal entry p_ij / sqrt(p_ii * p_jj) is comparable across
# pairs, then keep only the strongest connections
partial_correlations = edge_model.precision_.copy()
d = 1 / np.sqrt(np.diag(partial_correlations))
partial_correlations *= d
partial_correlations *= d[:, np.newaxis]
non_zero = np.abs(np.triu(partial_correlations, k=1)) > 0.02

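# After this normalization the diagonal of partial_correlations is 1, as in a
# correlation matrix (a quick check added for illustration; not part of the
# original example):
assert np.allclose(np.diag(partial_correlations), 1.0)
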
# Plot the nodes using the coordinates of our embedding
plt.scatter(
    embedding[0], embedding[1], s=100 * d**2, c=labels, cmap=plt.cm.nipy_spectral
)

# Plot the edges
start_idx, end_idx = np.where(non_zero)
# a sequence of (*line0*, *line1*, *line2*), where::
#     linen = (x0, y0), (x1, y1), ... (xm, ym)
segments = [
    [embedding[:, start], embedding[:, stop]] for start, stop in zip(start_idx, end_idx)
]
values = np.abs(partial_correlations[non_zero])
lc = LineCollection(
    segments, zorder=0, cmap=plt.cm.hot_r, norm=plt.Normalize(0, 0.7 * values.max())
)
lc.set_array(values)
lc.set_linewidths(15 * values)
ax.add_collection(lc)

# Add a label to each node. The challenge here is that we want to
# position the labels to avoid overlap with other labels
for index, (name, label, (x, y)) in enumerate(zip(names, labels, embedding.T)):
    dx = x - embedding[0]
    dx[index] = 1
    dy = y - embedding[1]
    dy[index] = 1
    this_dx = dx[np.argmin(np.abs(dy))]
    this_dy = dy[np.argmin(np.abs(dx))]
    if this_dx > 0:
        horizontalalignment = "left"
        x = x + 0.002
    else:
        horizontalalignment = "right"
        x = x - 0.002
    if this_dy > 0:
        verticalalignment = "bottom"
        y = y + 0.002
    else:
        verticalalignment = "top"
        y = y - 0.002
    plt.text(
        x,
        y,
        name,
        size=10,
        horizontalalignment=horizontalalignment,
        verticalalignment=verticalalignment,
        bbox=dict(
            facecolor="w",
            edgecolor=plt.cm.nipy_spectral(label / float(n_labels)),
            alpha=0.6,
        ),
    )

plt.xlim(
    embedding[0].min() - 0.15 * np.ptp(embedding[0]),
    embedding[0].max() + 0.10 * np.ptp(embedding[0]),
)
plt.ylim(
    embedding[1].min() - 0.03 * np.ptp(embedding[1]),
    embedding[1].max() + 0.03 * np.ptp(embedding[1]),
)

plt.show()