"""
=======================================
Visualizing the stock market structure
=======================================

This example employs several unsupervised learning techniques to extract
the stock market structure from variations in historical quotes.

The quantity that we use is the daily variation in quote price: quotes
that are linked tend to fluctuate in relation to each other during a day.
"""

# Author: Gael Varoquaux gael.varoquaux@normalesup.org
# License: BSD 3 clause

# %%
# Retrieve the data from the Internet
# -----------------------------------
#
# The data is from 2003 - 2008. This period is reasonably calm: recent enough
# to include high-tech firms, yet before the 2008 crash. This kind of
# historical data can be obtained from APIs like
# `data.nasdaq.com <https://data.nasdaq.com/>`_ and
# `alphavantage.co <https://www.alphavantage.co/>`_.

import sys

import numpy as np
import pandas as pd

symbol_dict = {
    "TOT": "Total",
    "XOM": "Exxon",
    "CVX": "Chevron",
    "COP": "ConocoPhillips",
    "VLO": "Valero Energy",
    "MSFT": "Microsoft",
    "IBM": "IBM",
    "TWX": "Time Warner",
    "CMCSA": "Comcast",
    "CVC": "Cablevision",
    "YHOO": "Yahoo",
    "DELL": "Dell",
    "HPQ": "HP",
    "AMZN": "Amazon",
    "TM": "Toyota",
    "CAJ": "Canon",
    "SNE": "Sony",
    "F": "Ford",
    "HMC": "Honda",
    "NAV": "Navistar",
    "NOC": "Northrop Grumman",
    "BA": "Boeing",
    "KO": "Coca Cola",
    "MMM": "3M",
    "MCD": "McDonald's",
    "PEP": "Pepsi",
    "K": "Kellogg",
    "UN": "Unilever",
    "MAR": "Marriott",
    "PG": "Procter & Gamble",
    "CL": "Colgate-Palmolive",
    "GE": "General Electric",
    "WFC": "Wells Fargo",
    "JPM": "JPMorgan Chase",
    "AIG": "AIG",
    "AXP": "American Express",
    "BAC": "Bank of America",
    "GS": "Goldman Sachs",
    "AAPL": "Apple",
    "SAP": "SAP",
    "CSCO": "Cisco",
    "TXN": "Texas Instruments",
    "XRX": "Xerox",
    "WMT": "Wal-Mart",
    "HD": "Home Depot",
    "GSK": "GlaxoSmithKline",
    "PFE": "Pfizer",
    "SNY": "Sanofi-Aventis",
    "NVS": "Novartis",
    "KMB": "Kimberly-Clark",
    "R": "Ryder",
    "GD": "General Dynamics",
    "RTN": "Raytheon",
    "CVS": "CVS",
    "CAT": "Caterpillar",
    "DD": "DuPont de Nemours",
}


symbols, names = np.array(sorted(symbol_dict.items())).T

quotes = []

for symbol in symbols:
    print("Fetching quote history for %r" % symbol, file=sys.stderr)
    url = (
        "https://raw.githubusercontent.com/scikit-learn/examples-data/"
        "master/financial-data/{}.csv"
    )
    quotes.append(pd.read_csv(url.format(symbol)))

close_prices = np.vstack([q["close"] for q in quotes])
open_prices = np.vstack([q["open"] for q in quotes])

# The daily variations of the quotes are what carry the most information
variation = close_prices - open_prices

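# %%
# As a quick sanity check (added for illustration; not part of the original
# example), the variation matrix should have one row per stock and one
# column per trading day:
print(f"variation matrix shape (n_stocks, n_days): {variation.shape}")
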
# %%
# .. _stock_market:
#
# Learning a graph structure
# --------------------------
#
# We use sparse inverse covariance estimation to find which quotes are
# correlated conditionally on the others. Specifically, sparse inverse
# covariance gives us a graph, that is, a list of connections. For each
# symbol, the symbols that it is connected to are those useful in explaining
# its fluctuations.

from sklearn import covariance

alphas = np.logspace(-1.5, 1, num=10)
edge_model = covariance.GraphicalLassoCV(alphas=alphas)

# standardize the time series: using correlations rather than covariance is
# more efficient for structure recovery
X = variation.copy().T
X /= X.std(axis=0)
edge_model.fit(X)

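# %%
# As an added illustration (not part of the original example), we can inspect
# the regularization strength selected by cross-validation and count the
# edges of the resulting graph, i.e. the nonzero off-diagonal entries of the
# estimated precision matrix:
print(f"CV-selected alpha: {edge_model.alpha_:.3f}")
n_edges = int((np.abs(np.triu(edge_model.precision_, k=1)) > 1e-8).sum())
print(f"Number of graph edges (nonzero off-diagonal entries): {n_edges}")
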
# %%
# Clustering using affinity propagation
# -------------------------------------
#
# We use clustering to group together quotes that behave similarly. Here,
# amongst the :ref:`various clustering techniques <clustering>` available
# in scikit-learn, we use :ref:`affinity_propagation` as it does not
# enforce equal-size clusters, and it can choose the number of clusters
# automatically from the data.
#
# Note that this gives us a different indication than the graph, as the
# graph reflects conditional relations between variables, while the
# clustering reflects marginal properties: variables clustered together can
# be considered to have a similar impact at the level of the full stock
# market.

from sklearn import cluster

_, labels = cluster.affinity_propagation(edge_model.covariance_, random_state=0)
n_labels = labels.max()

for i in range(n_labels + 1):
    print(f"Cluster {i + 1}: {', '.join(names[labels == i])}")

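# %%
# As an added illustration (not part of the original example), affinity
# propagation chose the number of clusters itself; we can summarize the
# cluster sizes in one line:
print(f"Found {n_labels + 1} clusters with sizes: {np.bincount(labels)}")
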
# %%
# Embedding in 2D space
# ---------------------
#
# For visualization purposes, we need to lay out the different symbols on a
# 2D canvas. For this we use :ref:`manifold` techniques to retrieve a 2D
# embedding.
# We use a dense eigen_solver to achieve reproducibility (arpack is
# initialized with random vectors that we don't control). In addition, we
# use a large number of neighbors to capture the large-scale structure.

# Finding a low-dimension embedding for visualization: find the best position
# of the nodes (the stocks) on a 2D plane

from sklearn import manifold

node_position_model = manifold.LocallyLinearEmbedding(
    n_components=2, eigen_solver="dense", n_neighbors=6
)

embedding = node_position_model.fit_transform(X.T).T

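# %%
# Any manifold learner exposing the same ``fit_transform`` API could be
# swapped in for the node layout. A minimal sketch (added for illustration;
# not part of the original example) using metric MDS instead:
from sklearn.manifold import MDS

alt_embedding = MDS(n_components=2, random_state=0).fit_transform(X.T).T
print(f"MDS embedding shape: {alt_embedding.shape}")  # (2, n_stocks)
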
# %%
# Visualization
# -------------
#
# The outputs of the 3 models are combined in a 2D graph where nodes
# represent the stocks and edges the connections between them:
#
# - cluster labels are used to define the color of the nodes
# - the sparse covariance model is used to display the strength of the edges
# - the 2D embedding is used to position the nodes in the plane
#
# This example has a fair amount of visualization-related code, as
# visualization is crucial here to display the graph. One of the challenges
# is to position the labels while minimizing overlap. For this we use a
# heuristic based on the direction of the nearest neighbor along each
# axis.

import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

plt.figure(1, facecolor="w", figsize=(10, 8))
plt.clf()
ax = plt.axes([0.0, 0.0, 1.0, 1.0])
plt.axis("off")

# Plot the graph of partial correlations: normalize the precision matrix so
# that each off-diagonal entry p_ij / sqrt(p_ii * p_jj) is comparable across
# pairs, then keep only the strongest connections
partial_correlations = edge_model.precision_.copy()
d = 1 / np.sqrt(np.diag(partial_correlations))
partial_correlations *= d
partial_correlations *= d[:, np.newaxis]
non_zero = np.abs(np.triu(partial_correlations, k=1)) > 0.02

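# After this normalization the diagonal of partial_correlations is 1, as in a
# correlation matrix (a quick check added for illustration; not part of the
# original example):
assert np.allclose(np.diag(partial_correlations), 1.0)
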
# Plot the nodes using the coordinates of our embedding
plt.scatter(
    embedding[0], embedding[1], s=100 * d**2, c=labels, cmap=plt.cm.nipy_spectral
)

# Plot the edges
start_idx, end_idx = np.where(non_zero)
# a sequence of (*line0*, *line1*, *line2*), where::
#     linen = (x0, y0), (x1, y1), ... (xm, ym)
segments = [
    [embedding[:, start], embedding[:, stop]] for start, stop in zip(start_idx, end_idx)
]
values = np.abs(partial_correlations[non_zero])
lc = LineCollection(
    segments, zorder=0, cmap=plt.cm.hot_r, norm=plt.Normalize(0, 0.7 * values.max())
)
lc.set_array(values)
lc.set_linewidths(15 * values)
ax.add_collection(lc)

# Add a label to each node. The challenge here is that we want to
# position the labels to avoid overlap with other labels
for index, (name, label, (x, y)) in enumerate(zip(names, labels, embedding.T)):
    dx = x - embedding[0]
    dx[index] = 1
    dy = y - embedding[1]
    dy[index] = 1
    this_dx = dx[np.argmin(np.abs(dy))]
    this_dy = dy[np.argmin(np.abs(dx))]
    if this_dx > 0:
        horizontalalignment = "left"
        x = x + 0.002
    else:
        horizontalalignment = "right"
        x = x - 0.002
    if this_dy > 0:
        verticalalignment = "bottom"
        y = y + 0.002
    else:
        verticalalignment = "top"
        y = y - 0.002
    plt.text(
        x,
        y,
        name,
        size=10,
        horizontalalignment=horizontalalignment,
        verticalalignment=verticalalignment,
        bbox=dict(
            facecolor="w",
            edgecolor=plt.cm.nipy_spectral(label / float(n_labels)),
            alpha=0.6,
        ),
    )

plt.xlim(
    embedding[0].min() - 0.15 * np.ptp(embedding[0]),
    embedding[0].max() + 0.10 * np.ptp(embedding[0]),
)
plt.ylim(
    embedding[1].min() - 0.03 * np.ptp(embedding[1]),
    embedding[1].max() + 0.03 * np.ptp(embedding[1]),
)

plt.show()