2024
Singh, Sidak Pal; Adilova, Linara; Kamp, Michael; Fischer, Asja; Schölkopf, Bernhard; Hofmann, Thomas
Landscaping Linear Mode Connectivity Proceedings Article
In: ICML Workshop on High-dimensional Learning Dynamics: The Emergence of Structure and Reasoning, 2024.
BibTeX | Tags: deep learning, linear mode connectivity, theory of deep learning
@inproceedings{singh2024landscaping,
title = {Landscaping Linear Mode Connectivity},
author = {Sidak Pal Singh and Linara Adilova and Michael Kamp and Asja Fischer and Bernhard Schölkopf and Thomas Hofmann},
year = {2024},
date = {2024-09-01},
urldate = {2024-09-01},
booktitle = {ICML Workshop on High-dimensional Learning Dynamics: The Emergence of Structure and Reasoning},
keywords = {deep learning, linear mode connectivity, theory of deep learning},
pubstate = {published},
tppubtype = {inproceedings}
}

Adilova, Linara; Andriushchenko, Maksym; Kamp, Michael; Fischer, Asja; Jaggi, Martin
Layer-wise Linear Mode Connectivity Proceedings Article
In: International Conference on Learning Representations (ICLR), Curran Associates, Inc., 2024.
Abstract | Links | BibTeX | Tags: deep learning, layer-wise, linear mode connectivity
@inproceedings{adilova2024layerwise,
title = {Layer-wise Linear Mode Connectivity},
author = {Linara Adilova and Maksym Andriushchenko and Michael Kamp and Asja Fischer and Martin Jaggi},
url = {https://openreview.net/pdf?id=LfmZh91tDI},
year = {2024},
date = {2024-05-07},
urldate = {2024-05-07},
booktitle = {International Conference on Learning Representations (ICLR)},
publisher = {Curran Associates, Inc.},
abstract = {Averaging neural network parameters is an intuitive method for fusing the knowledge of two independent models. It is most prominently used in federated learning. If models are averaged at the end of training, this can only lead to a well-performing model if the loss surface of interest is very particular, i.e., the loss in the exact middle between the two models needs to be sufficiently low. This is impossible to guarantee for the non-convex losses of state-of-the-art networks. For averaging models trained on vastly different datasets, it was proposed to average only the parameters of particular layers or combinations of layers, resulting in better-performing models. To get a better understanding of the effect of layer-wise averaging, we analyze the performance of the models that result from averaging single layers or groups of layers. Based on our empirical and theoretical investigation, we introduce a novel notion of layer-wise linear connectivity and show that deep networks do not have layer-wise barriers between them. We additionally analyze layer-wise averaging for personalization and conjecture that, in this particular problem setup, all partial aggregations result in approximately the same performance.},
keywords = {deep learning, layer-wise, linear mode connectivity},
pubstate = {published},
tppubtype = {inproceedings}
}
Averaging neural network parameters is an intuitive method for fusing the knowledge of two independent models. It is most prominently used in federated learning. If models are averaged at the end of training, this can only lead to a well-performing model if the loss surface of interest is very particular, i.e., the loss in the exact middle between the two models needs to be sufficiently low. This is impossible to guarantee for the non-convex losses of state-of-the-art networks. For averaging models trained on vastly different datasets, it was proposed to average only the parameters of particular layers or combinations of layers, resulting in better-performing models. To get a better understanding of the effect of layer-wise averaging, we analyze the performance of the models that result from averaging single layers or groups of layers. Based on our empirical and theoretical investigation, we introduce a novel notion of layer-wise linear connectivity and show that deep networks do not have layer-wise barriers between them. We additionally analyze layer-wise averaging for personalization and conjecture that, in this particular problem setup, all partial aggregations result in approximately the same performance.
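
The abstract above refers to averaging the parameters of individual layers and to barriers along the linear path between two models. The following is a minimal illustrative sketch in PyTorch of these two operations, not the paper's implementation: it assumes two trained models with identical architectures, and the `evaluate_loss` callable and the layer name prefix are hypothetical placeholders.

import copy
import torch

def average_layer(model_a, model_b, layer_name):
    """Return a copy of model_a in which only the parameters whose names start
    with `layer_name` are replaced by the element-wise average of both models."""
    fused = copy.deepcopy(model_a)
    state_a, state_b = model_a.state_dict(), model_b.state_dict()
    fused_state = fused.state_dict()
    for key in fused_state:
        if key.startswith(layer_name) and torch.is_floating_point(fused_state[key]):
            fused_state[key] = 0.5 * (state_a[key] + state_b[key])
    fused.load_state_dict(fused_state)
    return fused

def interpolation_barrier(model_a, model_b, evaluate_loss,
                          alphas=(0.0, 0.25, 0.5, 0.75, 1.0)):
    """Estimate the linear-interpolation barrier: the largest excess loss along
    the straight line between the two parameter vectors, measured relative to
    the mean loss of the two endpoints."""
    state_a, state_b = model_a.state_dict(), model_b.state_dict()
    probe = copy.deepcopy(model_a)
    losses = []
    for alpha in alphas:
        mixed = {
            k: (1 - alpha) * state_a[k] + alpha * state_b[k]
            if torch.is_floating_point(state_a[k]) else state_a[k]
            for k in state_a
        }
        probe.load_state_dict(mixed)
        losses.append(evaluate_loss(probe))  # placeholder: loss on a held-out set
    endpoint_mean = 0.5 * (losses[0] + losses[-1])
    return max(loss - endpoint_mean for loss in losses)

For example, `average_layer(model_a, model_b, "layer3")` would fuse only the parameters of a block named `layer3`, while calling `interpolation_barrier` on the full models probes standard linear mode connectivity; restricting the mixing to a single layer, as in the paper's layer-wise notion, corresponds to interpolating only that layer's parameters while keeping the rest fixed.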
