13 changes: 13 additions & 0 deletions files/build_templates/sonic_debian_extension.j2
@@ -34,6 +34,8 @@ FILESYSTEM_ROOT_USR="$FILESYSTEM_ROOT/usr"
FILESYSTEM_ROOT_USR_SHARE="$FILESYSTEM_ROOT_USR/share"
FILESYSTEM_ROOT_USR_SHARE_SONIC="$FILESYSTEM_ROOT_USR_SHARE/sonic"
FILESYSTEM_ROOT_USR_SHARE_SONIC_TEMPLATES="$FILESYSTEM_ROOT_USR_SHARE_SONIC/templates"
FILESYSTEM_ROOT_ETC="$FILESYSTEM_ROOT/etc"
FILESYSTEM_ROOT_ETC_SONIC="$FILESYSTEM_ROOT_ETC/sonic"

clean_sys() {
    sudo umount $FILESYSTEM_ROOT/sys/fs/cgroup/* \

@@ -182,6 +184,17 @@ sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable hostcfgd.service
sudo cp $IMAGE_CONFIGS/hostcfgd/hostcfgd $FILESYSTEM_ROOT/usr/bin/
sudo cp $IMAGE_CONFIGS/hostcfgd/*.j2 $FILESYSTEM_ROOT_USR_SHARE_SONIC_TEMPLATES/

# Copy core file uploader files
sudo cp $IMAGE_CONFIGS/corefile_uploader/core_uploader.service $FILESYSTEM_ROOT/etc/systemd/system/
Collaborator:
it would be better to put all these into sonic-utilities. In the build image repo, we just need to install the core_uploader service.

Contributor Author:
ok.

Contributor Author:
I left core_uploader.py and its rc file here, as they are part of the .service. Moved update_json.py to sonic-utilities.

sudo LANG=C chroot $FILESYSTEM_ROOT systemctl disable core_uploader.service
sudo cp $IMAGE_CONFIGS/corefile_uploader/core_uploader.py $FILESYSTEM_ROOT/usr/bin/
sudo cp $IMAGE_CONFIGS/corefile_uploader/update_json.py $FILESYSTEM_ROOT/usr/bin/
sudo cp $IMAGE_CONFIGS/corefile_uploader/core_analyzer.rc.json $FILESYSTEM_ROOT_ETC_SONIC/
sudo chmod og-rw $FILESYSTEM_ROOT_ETC_SONIC/core_analyzer.rc.json
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install azure-storage
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install watchdog
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install futures

# Copy the buffer configuration template
sudo cp $BUILD_TEMPLATES/buffers_config.j2 $FILESYSTEM_ROOT_USR_SHARE_SONIC_TEMPLATES/

17 changes: 17 additions & 0 deletions files/image_config/corefile_uploader/core_analyzer.rc.json
@@ -0,0 +1,17 @@
{
    "local_work": {
        "core_upload": "/tmp/core_upload/"
    },
    "azure_sonic_core_storage": {
        "account_name": "corefilecollection",
Collaborator:
can you use a template for both name and share_name?

Contributor Author:
You can use /usr/bin/update_json.py to update any of the entries.
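
For example, an operator could fill in the account key with a small patch file. A minimal sketch (the key value and the patch file path are placeholders, not real values):

# Hypothetical usage of update_json.py; run as root, since the rc file
# lives under /etc/sonic.
import json
import subprocess

patch = {"azure_sonic_core_storage": {"account_key": "<storage-account-key>"}}
with open("/tmp/patch.json", "w") as f:
    json.dump(patch, f)

subprocess.check_call(["/usr/bin/update_json.py",
                       "-r", "/etc/sonic/core_analyzer.rc.json",
                       "-p", "/tmp/patch.json"])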

"account_key": "",
"share_name": "corefiles-root"
},
"metadata_files_in_archive": {
"version": "/etc/sonic/sonic_version.yml",
"core_info": "core_info.json"
},
"env": {
"https_proxy": ""
}
}
276 changes: 276 additions & 0 deletions files/image_config/corefile_uploader/core_uploader.py
@@ -0,0 +1,276 @@
#!/usr/bin/env python

import os
import time
import tarfile
import socket
import yaml
import json
import syslog
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from azure.storage.file import FileService

global CORE_FILE_PATH, RC_FILE
global hostname, sonicversion, asicname, acctname, acctkey, sharename, cwd
global INIT_CWD
global log_level
global this_file

this_file = os.path.basename(__file__)

global cfg
cfg = ""

CORE_FILE_PATH = "/var/core/"
RC_FILE = "/etc/sonic/core_analyzer.rc.json"
INIT_CWD = "/tmp"

hostname = ""
sonicversion = ""
asicname = ""
acctname = ""
acctkey = ""
sharename = ""
cwd = []

HOURS_4 = (4 * 60 * 60)
PAUSE_ON_FAIL = (60 * 60)
MAX_RETRIES = 5

log_level = syslog.LOG_DEBUG

def log_msg(lvl, fname, m):
    if (lvl <= log_level):
        syslog.syslog(lvl, "{}: {}".format(fname, m))

        if log_level == syslog.LOG_DEBUG:
            print("{}: {}".format(fname, m))

def log_err(m):
    log_msg(syslog.LOG_ERR, this_file, m)

def log_info(m):
    log_msg(syslog.LOG_INFO, this_file, m)

def log_warn(m):
    log_msg(syslog.LOG_WARNING, this_file, m)

def log_debug(m):
    log_msg(syslog.LOG_DEBUG, this_file, m)


def make_new_dir(p):
    os.system("rm -rf " + p)
    os.system("mkdir -p " + p)

def parse_a_json(data, prefix, val):
    # Flatten nested JSON into a dict keyed by tuples of nested keys.
    for i in data:
        if type(data[i]) == dict:
            parse_a_json(data[i], prefix + (i,), val)
        else:
            val[prefix + (i,)] = data[i]
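# For illustration (hypothetical values), the flattening looks like:
#   val = {}
#   parse_a_json({"a": {"b": 1}, "c": 2}, (), val)
#   => val == {("a", "b"): 1, ("c",): 2}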

class config:
    parsed_data = {}
    cfg_data = {}

    def __init__(self):
        while not os.path.exists(RC_FILE):
            # Wait here until service restart
            log_err("Unable to retrieve Azure storage credentials")
            time.sleep(HOURS_4)

        with open(RC_FILE, 'r') as f:
            self.parsed_data = json.load(f)
        parse_a_json(self.parsed_data, (), self.cfg_data)

    def get_data(self, k):
        return self.cfg_data[k] if k in self.cfg_data else ""

    def get_dict(self):
        return self.parsed_data

    def get_core_info(self, corepath, devicename):
        info = {}
        info["corefname"] = os.path.basename(corepath)
        info["tstamp"] = str(os.stat(corepath).st_ctime)
        info["devicename"] = devicename

        lpath = self.get_data(("metadata_files_in_archive", "core_info"))
        with open(lpath, "w+") as f:
            f.write(json.dumps(info, indent=4))

        return lpath


class Watcher:

    def __init__(self):
        self.observer = Observer()

    def run(self):
        event_handler = Handler()
        self.observer.schedule(event_handler, CORE_FILE_PATH)
        self.observer.start()
        try:
            while True:
                time.sleep(5)
        except:
            # Bare except: also catch KeyboardInterrupt so the observer
            # is stopped cleanly before exit.
            self.observer.stop()
            log_err("Error in watcher")

        self.observer.join()

def set_env(lst):
    for k in lst:
        if lst[k]:
            os.environ[k] = lst[k]
            log_debug("set env {} = {}".format(k, lst[k]))

class Handler(FileSystemEventHandler):

    @staticmethod
    def init():
        global hostname, sonicversion, asicname, acctname, acctkey, sharename
        global cwd, cfg

        cfg = config()

        set_env(cfg.get_dict()["env"])

        hostname = socket.gethostname()
        if not hostname:
            raise Exception("Failed to read hostname")

        acctname = cfg.get_data(("azure_sonic_core_storage", "account_name"))
        acctkey = cfg.get_data(("azure_sonic_core_storage", "account_key"))
        sharename = cfg.get_data(("azure_sonic_core_storage", "share_name"))

        if not acctname or not acctkey or not sharename:
            while True:
                # Wait here until service restart
                log_err("Unable to retrieve Azure storage credentials")
                time.sleep(HOURS_4)

        with open("/etc/sonic/sonic_version.yml", 'r') as stream:
            l = yaml.safe_load(stream)
            sonicversion = l['build_version']
            asicname = l['asic_type']

        if not sonicversion:
            raise Exception("Failed to read build_version from /etc/sonic/sonic_version.yml")

        if not asicname:
            raise Exception("Failed to read asic_type from /etc/sonic/sonic_version.yml")

        cwd = cfg.get_data(("local_work", "core_upload")).split("/")
        if not len(cwd) > 2:
            raise Exception("Invalid path for core_upload. Expect a min of two elements in path")

        os.chdir(INIT_CWD)

    @staticmethod
    def on_any_event(event):
        if event.is_directory:
            return None

        elif event.event_type == 'created':
            # Take any action here when a file is first created.
            log_debug("Received create event - " + event.src_path)
            Handler.handle_file(event.src_path)


    @staticmethod
    def wait_for_file_write_complete(path):
Collaborator:
a better way is to check the update time of the file and compare it with the current time; if it is more than 10 minutes old, that likely means the file is complete.

Contributor Author:
Sure.

Contributor Author:
ctime is not creation time but modified time, so I changed this to be safer. Theoretically there can't be one robust, safe way, as it all depends on how the write happens.

        ct_size = -1

        # Poll until the file size stops changing, then confirm once more.
        while ct_size != os.path.getsize(path):
            ct_size = os.path.getsize(path)
            time.sleep(2)

        time.sleep(2)
        if ct_size != os.path.getsize(path):
            raise Exception("Dump file creation is too slow: " + path)

        log_debug("File write complete - " + path)


    @staticmethod
    def handle_file(path):

        Handler.wait_for_file_write_complete(path)

        lpath = "/".join(cwd)
        make_new_dir(lpath)
        os.chdir(lpath)

        # Create a new archive with core & more.
        metafiles = cfg.get_dict()["metadata_files_in_archive"]

        fname = os.path.basename(path)
        tarf_name = fname + ".tar.gz"

        cfg.get_core_info(path, hostname)

        tar = tarfile.open(tarf_name, "w:gz")
        for e in metafiles:
            tar.add(metafiles[e])
        tar.add(path)
        tar.close()
        log_debug("Tar file for upload created: " + tarf_name)

        Handler.upload_file(tarf_name, tarf_name)
Collaborator:
is the core file removed after upload?

Contributor Author:
No. Reasons:

1. I am not attaching the core file to the ICM now, so as not to increase the ICM DB size; hence we need it here.

2. With core files remaining, the service will re-upload them upon restart:
   a) Re-upload is not a problem, as the analyzer understands that it is a duplicate and will not file another ticket.
   b) If the file is still in Azure storage, it only gets re-updated, with no duplicates. In a normal system, service restarts should be pretty seldom (only upon reboot).

3. When a process dumps cores repeatedly, only one ticket is created. Even if we attached core files, we would attach only one; but often we would like to see multiple dumps during core file analysis to help spot a pattern.

On a side note, FYI: we have an upcoming core-mgmt service from Broadcom that would limit the count of core files per process.

log_debug("File uploaded - " + path)
os.chdir(INIT_CWD)

    @staticmethod
    def upload_file(fname, fpath):
        daemonname = fname.split(".")[0]
        i = 0
        fail_msg = ""

        while i <= MAX_RETRIES:
            try:
                svc = FileService(account_name=acctname, account_key=acctkey)

                l = [sonicversion, asicname, daemonname, hostname]
                e = []
                # Azure file shares require each directory level to be
                # created one at a time.
                while len(e) != len(l):
                    e.append(l[len(e)])
                    svc.create_directory(sharename, "/".join(e))

                log_debug("Remote dir created: " + "/".join(e))

                svc.create_file_from_path(sharename, "/".join(l), fname, fpath)
                log_debug("Remote file created: name {} path {}".format(fname, fpath))
                break

            except Exception as ex:
Collaborator:
what kind of failures do we see? If it is an access failure due to credentials, then retry would not help.

Contributor Author:
Being an HTTP-based service, there can be many different failures, so we retry a few times with a pause in between. If we still cannot upload, we get back to the main loop and wait for the next core.

Collaborator (@lguohan, Dec 16, 2019):
can you try an invalid user name and password and see what error message we get? At least if it is an account issue, we should print some message for alerting. You want to differentiate temporary network errors from systematic errors (for example, no storage space or an account error), so that system managers can treat those messages differently.
log_err("core uploader failed: Failed during upload (" + str(e) +")")
fail_msg = str(e)
i += 1
if i >= MAX_RETRIES:
raise Exception("Failed while uploading. msg(" + fail_msg + ") after " + str(i) + " retries")
time.sleep(PAUSE_ON_FAIL)
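
    # Review note: a sketch of separating credential failures from
    # transient network errors, assuming the legacy azure-storage SDK
    # where HTTP failures raise azure.common.AzureHttpError carrying a
    # status_code (illustrative helper, not wired into the retry loop):
    @staticmethod
    def is_auth_error(ex):
        try:
            from azure.common import AzureHttpError
        except ImportError:
            return False
        # 401/403 mean the account name/key is wrong; retrying will not
        # help, so alert the operator instead of pausing and retrying.
        return isinstance(ex, AzureHttpError) and ex.status_code in (401, 403)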


    @staticmethod
    def scan():
        for e in os.listdir(CORE_FILE_PATH):
            fl = CORE_FILE_PATH + e
            if os.path.isfile(fl):
                Handler.handle_file(fl)


if __name__ == '__main__':
    try:
        Handler.init()
        w = Watcher()
        Handler.scan()
        w.run()
    except Exception as e:
        log_err("core uploader failed: " + str(e) + " Exiting ...")

11 changes: 11 additions & 0 deletions files/image_config/corefile_uploader/core_uploader.service
@@ -0,0 +1,11 @@
[Unit]
Description=Host core file uploader daemon
Requires=updategraph.service
After=updategraph.service

[Service]
Type=simple
ExecStart=/usr/bin/core_uploader.py

[Install]
WantedBy=multi-user.target
55 changes: 55 additions & 0 deletions files/image_config/corefile_uploader/update_json.py
@@ -0,0 +1,55 @@
#! /usr/bin/env python

import os
import sys
import json
import argparse

TMP_SUFFIX = ".tmp"
BAK_SUFFIX = ".bak"

def dict_update(dst, patch):
    # Recursively merge patch into dst, overwriting leaf values. Use
    # .get() so a key present only in the patch does not raise KeyError.
    for k in patch.keys():
        if type(patch[k]) == dict:
            dst[k] = dict_update(dst.get(k, {}), patch[k])
        else:
            dst[k] = patch[k]
    return dst

def do_update(rcf, patchf):
    dst = {}
    patch = {}

    tmpf = rcf + TMP_SUFFIX
    bakf = rcf + BAK_SUFFIX

    with open(rcf, "r") as f:
        dst = json.load(f)

    with open(patchf, "r") as f:
        patch = json.load(f)

    dst = dict_update(dst, patch)

    with open(tmpf, "w") as f:
        json.dump(dst, f, indent=4)

    # Keep a backup of the original, then swap in the updated file.
    os.rename(rcf, bakf)
    os.rename(tmpf, rcf)


def main():
    parser = argparse.ArgumentParser(description="Update JSON based file")
    parser.add_argument("-r", "--rc", help="JSON file to be updated")
    parser.add_argument("-p", "--patch", help="JSON file holding patch")
    args = parser.parse_args()

    if not args.rc or not args.patch:
        raise Exception("check usage")

    do_update(args.rc, args.patch)

if __name__ == '__main__':
    main()